fats-fme committed on
Commit b0041c6
1 Parent(s): 6999d3e

Training in progress, step 249, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f9e7aff42b36fe14e95ece06193160112474b8a29fc3680ce273c922ca5686f6
+ oid sha256:12fef18f99c9bf3ec9eedf986ccbe12d2b84ec11b66e3b9788ae7ec43065b3d7
  size 216151256
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2e21f976a284dd81c64396ec6b6206079943029f7c09ac486e503562b06e47e6
+ oid sha256:541a7b0bf8fbbc06854b3c570354a27505018117b0f0d67f11955711b3bef1b4
  size 432640054
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bb3607b5839cda7054779e8f957cbf2db3456879873cc4e34eac04cbf33f5db8
+ oid sha256:341f7be18cc89c2ad2dec55ac567729ae8e5db65bdf39a5bc196fbd79e2cbf16
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fd635f6ad590a43a7a075b3fb4377adaa95cf2d835f115014607cf181d2b6449
+ oid sha256:d870e1e8472cc5e0d2cb8fe273473f96e2cec9ffdf9fce6a51cdd5b21cd3bae6
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e3070c5337425657c2fec031251a5e4e8042c43dd7a5d4d7f77fa453b02282be
+ oid sha256:3f919157faf64362df2e66ee2a7671eb7f6cf8287caadba981f351111997c856
  size 1064
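
Each of the checkpoint files above is stored through Git LFS, so this commit only rewrites the three-line pointer files (version, oid, size) rather than the binary blobs themselves. As a rough illustration only (not part of this commit; the file paths are placeholders), a pointer can be parsed and a downloaded blob checked against it like this:

import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path):
    # Split each "key value" line of the pointer file into a dict, e.g.
    # {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:...", "size": "1064"}.
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def blob_matches_pointer(blob_path, pointer):
    # Recompute the blob's sha256 and compare it with the oid and size recorded in the pointer.
    data = Path(blob_path).read_bytes()
    digest = "sha256:" + hashlib.sha256(data).hexdigest()
    return digest == pointer["oid"] and len(data) == int(pointer["size"])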
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.5001883239171375,
+ "epoch": 0.7502824858757062,
  "eval_steps": 83,
- "global_step": 166,
+ "global_step": 249,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1193,6 +1193,595 @@
  "eval_samples_per_second": 6.002,
  "eval_steps_per_second": 1.503,
  "step": 166
+ },
+ {
+ "epoch": 0.5032015065913371,
+ "grad_norm": 1.2678625583648682,
+ "learning_rate": 6.298594285815584e-05,
+ "loss": 0.2982,
+ "step": 167
+ },
+ {
+ "epoch": 0.5062146892655367,
+ "grad_norm": 1.2661633491516113,
+ "learning_rate": 6.244532285066382e-05,
+ "loss": 0.3381,
+ "step": 168
+ },
+ {
+ "epoch": 0.5092278719397364,
+ "grad_norm": 1.160649061203003,
+ "learning_rate": 6.190314727607196e-05,
+ "loss": 0.3428,
+ "step": 169
+ },
+ {
+ "epoch": 0.512241054613936,
+ "grad_norm": 2.1445140838623047,
+ "learning_rate": 6.13594839020466e-05,
+ "loss": 0.6844,
+ "step": 170
+ },
+ {
+ "epoch": 0.5152542372881356,
+ "grad_norm": 1.9892338514328003,
+ "learning_rate": 6.0814400682217234e-05,
+ "loss": 0.559,
+ "step": 171
+ },
+ {
+ "epoch": 0.5182674199623352,
+ "grad_norm": 3.0277585983276367,
+ "learning_rate": 6.026796574768288e-05,
+ "loss": 0.6495,
+ "step": 172
+ },
+ {
+ "epoch": 0.5212806026365349,
+ "grad_norm": 3.0984301567077637,
+ "learning_rate": 5.972024739849622e-05,
+ "loss": 0.4114,
+ "step": 173
+ },
+ {
+ "epoch": 0.5242937853107345,
+ "grad_norm": 3.296858549118042,
+ "learning_rate": 5.917131409512663e-05,
+ "loss": 0.5272,
+ "step": 174
+ },
+ {
+ "epoch": 0.527306967984934,
+ "grad_norm": 4.093991756439209,
+ "learning_rate": 5.862123444990318e-05,
+ "loss": 0.5134,
+ "step": 175
+ },
+ {
+ "epoch": 0.5303201506591337,
+ "grad_norm": 1.48560631275177,
+ "learning_rate": 5.807007721843861e-05,
+ "loss": 0.5482,
+ "step": 176
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 1.6580932140350342,
+ "learning_rate": 5.751791129103544e-05,
+ "loss": 0.5894,
+ "step": 177
+ },
+ {
+ "epoch": 0.5363465160075329,
+ "grad_norm": 1.1557704210281372,
+ "learning_rate": 5.696480568407523e-05,
+ "loss": 0.4388,
+ "step": 178
+ },
+ {
+ "epoch": 0.5393596986817326,
+ "grad_norm": 1.1503976583480835,
+ "learning_rate": 5.6410829531392006e-05,
+ "loss": 0.3841,
+ "step": 179
+ },
+ {
+ "epoch": 0.5423728813559322,
+ "grad_norm": 1.0935317277908325,
+ "learning_rate": 5.585605207563124e-05,
+ "loss": 0.38,
+ "step": 180
+ },
+ {
+ "epoch": 0.5453860640301318,
+ "grad_norm": 1.0500688552856445,
+ "learning_rate": 5.5300542659594854e-05,
+ "loss": 0.37,
+ "step": 181
+ },
+ {
+ "epoch": 0.5483992467043315,
+ "grad_norm": 0.910677433013916,
+ "learning_rate": 5.47443707175741e-05,
+ "loss": 0.335,
+ "step": 182
+ },
+ {
+ "epoch": 0.5514124293785311,
+ "grad_norm": 1.0660496950149536,
+ "learning_rate": 5.418760576667071e-05,
+ "loss": 0.3516,
+ "step": 183
+ },
+ {
+ "epoch": 0.5544256120527307,
+ "grad_norm": 0.8869411945343018,
+ "learning_rate": 5.3630317398107864e-05,
+ "loss": 0.3249,
+ "step": 184
+ },
+ {
+ "epoch": 0.5574387947269304,
+ "grad_norm": 1.6419062614440918,
+ "learning_rate": 5.3072575268531835e-05,
+ "loss": 0.3531,
+ "step": 185
+ },
+ {
+ "epoch": 0.56045197740113,
+ "grad_norm": 0.8144354820251465,
+ "learning_rate": 5.2514449091305375e-05,
+ "loss": 0.3002,
+ "step": 186
+ },
+ {
+ "epoch": 0.5634651600753295,
+ "grad_norm": 1.1171809434890747,
+ "learning_rate": 5.195600862779421e-05,
+ "loss": 0.3776,
+ "step": 187
+ },
+ {
+ "epoch": 0.5664783427495292,
+ "grad_norm": 1.14911687374115,
+ "learning_rate": 5.139732367864736e-05,
+ "loss": 0.3215,
+ "step": 188
+ },
+ {
+ "epoch": 0.5694915254237288,
+ "grad_norm": 0.928807258605957,
+ "learning_rate": 5.083846407507263e-05,
+ "loss": 0.2747,
+ "step": 189
+ },
+ {
+ "epoch": 0.5725047080979284,
+ "grad_norm": 1.024272084236145,
+ "learning_rate": 5.0279499670108245e-05,
+ "loss": 0.3024,
+ "step": 190
+ },
+ {
+ "epoch": 0.5755178907721281,
+ "grad_norm": 1.2534008026123047,
+ "learning_rate": 4.972050032989175e-05,
+ "loss": 0.3704,
+ "step": 191
+ },
+ {
+ "epoch": 0.5785310734463277,
+ "grad_norm": 0.9777889847755432,
+ "learning_rate": 4.9161535924927374e-05,
+ "loss": 0.271,
+ "step": 192
+ },
+ {
+ "epoch": 0.5815442561205273,
+ "grad_norm": 1.0151127576828003,
+ "learning_rate": 4.860267632135265e-05,
+ "loss": 0.3179,
+ "step": 193
+ },
+ {
+ "epoch": 0.5845574387947269,
+ "grad_norm": 0.9281553030014038,
+ "learning_rate": 4.80439913722058e-05,
+ "loss": 0.2508,
+ "step": 194
+ },
+ {
+ "epoch": 0.5875706214689266,
+ "grad_norm": 1.6091618537902832,
+ "learning_rate": 4.748555090869464e-05,
+ "loss": 0.3701,
+ "step": 195
+ },
+ {
+ "epoch": 0.5905838041431262,
+ "grad_norm": 2.309112548828125,
+ "learning_rate": 4.692742473146818e-05,
+ "loss": 0.5701,
+ "step": 196
+ },
+ {
+ "epoch": 0.5935969868173258,
+ "grad_norm": 1.9035515785217285,
+ "learning_rate": 4.636968260189214e-05,
+ "loss": 0.5654,
+ "step": 197
+ },
+ {
+ "epoch": 0.5966101694915255,
+ "grad_norm": 2.222712516784668,
+ "learning_rate": 4.5812394233329305e-05,
+ "loss": 0.51,
+ "step": 198
+ },
+ {
+ "epoch": 0.599623352165725,
+ "grad_norm": 3.1770148277282715,
+ "learning_rate": 4.525562928242592e-05,
+ "loss": 0.6322,
+ "step": 199
+ },
+ {
+ "epoch": 0.6026365348399246,
+ "grad_norm": 3.6650121212005615,
+ "learning_rate": 4.4699457340405164e-05,
+ "loss": 0.4471,
+ "step": 200
+ },
+ {
+ "epoch": 0.6056497175141243,
+ "grad_norm": 1.350001573562622,
+ "learning_rate": 4.414394792436877e-05,
+ "loss": 0.4445,
+ "step": 201
+ },
+ {
+ "epoch": 0.6086629001883239,
+ "grad_norm": 1.3186067342758179,
+ "learning_rate": 4.3589170468607985e-05,
+ "loss": 0.4268,
+ "step": 202
+ },
+ {
+ "epoch": 0.6116760828625235,
+ "grad_norm": 1.0820094347000122,
+ "learning_rate": 4.3035194315924785e-05,
+ "loss": 0.349,
+ "step": 203
+ },
+ {
+ "epoch": 0.6146892655367232,
+ "grad_norm": 1.016855001449585,
+ "learning_rate": 4.248208870896456e-05,
+ "loss": 0.3657,
+ "step": 204
+ },
+ {
+ "epoch": 0.6177024482109228,
+ "grad_norm": 0.7627567648887634,
+ "learning_rate": 4.192992278156141e-05,
+ "loss": 0.282,
+ "step": 205
+ },
+ {
+ "epoch": 0.6207156308851224,
+ "grad_norm": 0.8734930753707886,
+ "learning_rate": 4.1378765550096835e-05,
+ "loss": 0.3205,
+ "step": 206
+ },
+ {
+ "epoch": 0.6237288135593221,
+ "grad_norm": 0.9233391880989075,
+ "learning_rate": 4.082868590487339e-05,
+ "loss": 0.2682,
+ "step": 207
+ },
+ {
+ "epoch": 0.6267419962335217,
+ "grad_norm": 0.7341740131378174,
+ "learning_rate": 4.027975260150381e-05,
+ "loss": 0.2488,
+ "step": 208
+ },
+ {
+ "epoch": 0.6297551789077213,
+ "grad_norm": 0.8570201992988586,
+ "learning_rate": 3.973203425231715e-05,
+ "loss": 0.2644,
+ "step": 209
+ },
+ {
+ "epoch": 0.632768361581921,
+ "grad_norm": 0.8284196853637695,
+ "learning_rate": 3.918559931778277e-05,
+ "loss": 0.3093,
+ "step": 210
+ },
+ {
+ "epoch": 0.6357815442561205,
+ "grad_norm": 0.7870326638221741,
+ "learning_rate": 3.8640516097953405e-05,
+ "loss": 0.2577,
+ "step": 211
+ },
+ {
+ "epoch": 0.6387947269303201,
+ "grad_norm": 0.9884381294250488,
+ "learning_rate": 3.809685272392804e-05,
+ "loss": 0.3252,
+ "step": 212
+ },
+ {
+ "epoch": 0.6418079096045197,
+ "grad_norm": 1.1668404340744019,
+ "learning_rate": 3.755467714933619e-05,
+ "loss": 0.3414,
+ "step": 213
+ },
+ {
+ "epoch": 0.6448210922787194,
+ "grad_norm": 1.1960536241531372,
+ "learning_rate": 3.701405714184416e-05,
+ "loss": 0.3029,
+ "step": 214
+ },
+ {
+ "epoch": 0.647834274952919,
+ "grad_norm": 0.7987526059150696,
+ "learning_rate": 3.647506027468467e-05,
+ "loss": 0.2501,
+ "step": 215
+ },
+ {
+ "epoch": 0.6508474576271186,
+ "grad_norm": 1.0721232891082764,
+ "learning_rate": 3.59377539182107e-05,
+ "loss": 0.3132,
+ "step": 216
+ },
+ {
+ "epoch": 0.6538606403013183,
+ "grad_norm": 0.9739212393760681,
+ "learning_rate": 3.5402205231474736e-05,
+ "loss": 0.2644,
+ "step": 217
+ },
+ {
+ "epoch": 0.6568738229755179,
+ "grad_norm": 1.1942991018295288,
+ "learning_rate": 3.486848115383445e-05,
+ "loss": 0.3206,
+ "step": 218
+ },
+ {
+ "epoch": 0.6598870056497175,
+ "grad_norm": 1.3838952779769897,
+ "learning_rate": 3.4336648396585776e-05,
+ "loss": 0.3569,
+ "step": 219
+ },
+ {
+ "epoch": 0.6629001883239172,
+ "grad_norm": 2.448126792907715,
+ "learning_rate": 3.380677343462447e-05,
+ "loss": 0.5818,
+ "step": 220
+ },
+ {
+ "epoch": 0.6659133709981168,
+ "grad_norm": 1.8376816511154175,
+ "learning_rate": 3.327892249813745e-05,
+ "loss": 0.4343,
+ "step": 221
+ },
+ {
+ "epoch": 0.6689265536723163,
+ "grad_norm": 2.102494478225708,
+ "learning_rate": 3.275316156432434e-05,
+ "loss": 0.4626,
+ "step": 222
+ },
+ {
+ "epoch": 0.671939736346516,
+ "grad_norm": 2.167078733444214,
+ "learning_rate": 3.2229556349150945e-05,
+ "loss": 0.4407,
+ "step": 223
+ },
+ {
+ "epoch": 0.6749529190207156,
+ "grad_norm": 2.149308204650879,
+ "learning_rate": 3.170817229913526e-05,
+ "loss": 0.3198,
+ "step": 224
+ },
+ {
+ "epoch": 0.6779661016949152,
+ "grad_norm": 4.306909084320068,
+ "learning_rate": 3.118907458316722e-05,
+ "loss": 0.5187,
+ "step": 225
+ },
+ {
+ "epoch": 0.6809792843691149,
+ "grad_norm": 1.0858993530273438,
+ "learning_rate": 3.067232808436299e-05,
+ "loss": 0.3973,
+ "step": 226
+ },
+ {
+ "epoch": 0.6839924670433145,
+ "grad_norm": 0.9466880559921265,
+ "learning_rate": 3.0157997391955172e-05,
+ "loss": 0.2911,
+ "step": 227
+ },
+ {
+ "epoch": 0.6870056497175141,
+ "grad_norm": 1.053808569908142,
+ "learning_rate": 2.964614679321966e-05,
+ "loss": 0.3425,
+ "step": 228
+ },
+ {
+ "epoch": 0.6900188323917138,
+ "grad_norm": 1.020087718963623,
+ "learning_rate": 2.913684026544021e-05,
+ "loss": 0.3171,
+ "step": 229
+ },
+ {
+ "epoch": 0.6930320150659134,
+ "grad_norm": 0.9112816452980042,
+ "learning_rate": 2.8630141467911775e-05,
+ "loss": 0.289,
+ "step": 230
+ },
+ {
+ "epoch": 0.696045197740113,
+ "grad_norm": 0.9472637176513672,
+ "learning_rate": 2.812611373398365e-05,
+ "loss": 0.2909,
+ "step": 231
+ },
+ {
+ "epoch": 0.6990583804143126,
+ "grad_norm": 0.8144400715827942,
+ "learning_rate": 2.762482006314324e-05,
+ "loss": 0.2527,
+ "step": 232
+ },
+ {
+ "epoch": 0.7020715630885123,
+ "grad_norm": 0.8899109363555908,
+ "learning_rate": 2.712632311314165e-05,
+ "loss": 0.2814,
+ "step": 233
+ },
+ {
+ "epoch": 0.7050847457627119,
+ "grad_norm": 0.8338634967803955,
+ "learning_rate": 2.6630685192161992e-05,
+ "loss": 0.2684,
+ "step": 234
+ },
+ {
+ "epoch": 0.7080979284369114,
+ "grad_norm": 1.1928447484970093,
+ "learning_rate": 2.6137968251031287e-05,
+ "loss": 0.327,
+ "step": 235
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 0.9519332647323608,
+ "learning_rate": 2.5648233875477157e-05,
+ "loss": 0.2797,
+ "step": 236
+ },
+ {
+ "epoch": 0.7141242937853107,
+ "grad_norm": 1.1364781856536865,
+ "learning_rate": 2.5161543278430054e-05,
+ "loss": 0.3121,
+ "step": 237
+ },
+ {
+ "epoch": 0.7171374764595103,
+ "grad_norm": 1.0575398206710815,
+ "learning_rate": 2.4677957292372167e-05,
+ "loss": 0.2866,
+ "step": 238
+ },
+ {
+ "epoch": 0.72015065913371,
+ "grad_norm": 0.7296847105026245,
+ "learning_rate": 2.419753636173379e-05,
+ "loss": 0.2432,
+ "step": 239
+ },
+ {
+ "epoch": 0.7231638418079096,
+ "grad_norm": 0.7573246955871582,
+ "learning_rate": 2.3720340535338348e-05,
+ "loss": 0.2545,
+ "step": 240
+ },
+ {
+ "epoch": 0.7261770244821092,
+ "grad_norm": 0.8128641247749329,
+ "learning_rate": 2.3246429458896634e-05,
+ "loss": 0.2548,
+ "step": 241
+ },
+ {
+ "epoch": 0.7291902071563089,
+ "grad_norm": 0.8031213283538818,
+ "learning_rate": 2.2775862367551644e-05,
+ "loss": 0.2509,
+ "step": 242
+ },
+ {
+ "epoch": 0.7322033898305085,
+ "grad_norm": 0.9279189109802246,
+ "learning_rate": 2.2308698078474645e-05,
+ "loss": 0.264,
+ "step": 243
+ },
+ {
+ "epoch": 0.7352165725047081,
+ "grad_norm": 1.0536640882492065,
+ "learning_rate": 2.1844994983513467e-05,
+ "loss": 0.3295,
+ "step": 244
+ },
+ {
+ "epoch": 0.7382297551789078,
+ "grad_norm": 2.177769422531128,
+ "learning_rate": 2.1384811041894055e-05,
+ "loss": 0.4975,
+ "step": 245
+ },
+ {
+ "epoch": 0.7412429378531074,
+ "grad_norm": 1.5739905834197998,
+ "learning_rate": 2.0928203772975917e-05,
+ "loss": 0.4395,
+ "step": 246
+ },
+ {
+ "epoch": 0.7442561205273069,
+ "grad_norm": 1.7161879539489746,
+ "learning_rate": 2.0475230249062725e-05,
+ "loss": 0.3947,
+ "step": 247
+ },
+ {
+ "epoch": 0.7472693032015066,
+ "grad_norm": 3.185561180114746,
+ "learning_rate": 2.0025947088268717e-05,
+ "loss": 0.6166,
+ "step": 248
+ },
+ {
+ "epoch": 0.7502824858757062,
+ "grad_norm": 2.201467275619507,
+ "learning_rate": 1.958041044744186e-05,
+ "loss": 0.3779,
+ "step": 249
+ },
+ {
+ "epoch": 0.7502824858757062,
+ "eval_loss": NaN,
+ "eval_runtime": 92.9008,
+ "eval_samples_per_second": 6.017,
+ "eval_steps_per_second": 1.507,
+ "step": 249
  }
  ],
  "logging_steps": 1,
 
@@ -1212,7 +1801,7 @@
  "attributes": {}
  }
  },
- "total_flos": 5.46878716765012e+17,
+ "total_flos": 8.20318075147518e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null