cocolalala committed on
Commit
c1a75b8
1 Parent(s): 85b6d28

Model save

Browse files
README.md CHANGED
@@ -17,6 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
  # asset-generation-sft-qlora
18
 
19
  This model was trained from scratch on the generator dataset.
 
 
20
 
21
  ## Model description
22
 
@@ -40,15 +42,20 @@ The following hyperparameters were used during training:
40
  - eval_batch_size: 32
41
  - seed: 42
42
  - distributed_type: multi-GPU
 
43
  - gradient_accumulation_steps: 2
44
- - total_train_batch_size: 32
 
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: cosine
47
  - lr_scheduler_warmup_ratio: 0.1
48
- - num_epochs: 0
49
 
50
  ### Training results
51
 
 
 
 
52
 
53
 
54
  ### Framework versions
 
17
  # asset-generation-sft-qlora
18
 
19
  This model was trained from scratch on the generator dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.7983
22
 
23
  ## Model description
24
 
 
42
  - eval_batch_size: 32
43
  - seed: 42
44
  - distributed_type: multi-GPU
45
+ - num_devices: 2
46
  - gradient_accumulation_steps: 2
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 64
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 1
53
 
54
  ### Training results
55
 
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:-----:|:----:|:---------------:|
58
+ | 0.8288 | 1.0 | 5088 | 0.7983 |
59
 
60
 
61
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.8844339622641509,
3
- "total_flos": 1.393076937424896e+19,
4
- "train_loss": 0.0,
5
- "train_runtime": 0.0091,
6
- "train_samples": 100,
7
- "train_samples_per_second": 0.0,
8
- "train_steps_per_second": 0.0
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.5751056572484157e+19,
4
+ "train_loss": 0.09284130807192821,
5
+ "train_runtime": 20560.1048,
6
+ "train_samples": 1055292,
7
+ "train_samples_per_second": 15.837,
8
+ "train_steps_per_second": 0.247
9
  }
runs/May25_13-55-16_br1t43-s3-25/events.out.tfevents.1716645331.br1t43-s3-25.187086.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:779d91afd5197d0f286a91734150e54d762ff168e7a61a2096656d93bb1907a8
3
- size 26553
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:346fafda02c3a931a24d9615d6f5811e351bbaabbefbf31491855a578e6ea4f9
3
+ size 30765
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.8844339622641509,
3
- "total_flos": 1.393076937424896e+19,
4
- "train_loss": 0.0,
5
- "train_runtime": 0.0091,
6
- "train_samples": 100,
7
- "train_samples_per_second": 0.0,
8
- "train_steps_per_second": 0.0
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.5751056572484157e+19,
4
+ "train_loss": 0.09284130807192821,
5
+ "train_runtime": 20560.1048,
6
+ "train_samples": 1055292,
7
+ "train_samples_per_second": 15.837,
8
+ "train_steps_per_second": 0.247
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8844339622641509,
5
  "eval_steps": 500,
6
- "global_step": 4500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6316,19 +6316,846 @@
6316
  "step": 4500
6317
  },
6318
  {
6319
- "epoch": 0.8844339622641509,
6320
- "step": 4500,
6321
- "total_flos": 1.393076937424896e+19,
6322
- "train_loss": 0.0,
6323
- "train_runtime": 0.0091,
6324
- "train_samples_per_second": 0.0,
6325
- "train_steps_per_second": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6326
  }
6327
  ],
6328
  "logging_steps": 5,
6329
- "max_steps": 0,
6330
  "num_input_tokens_seen": 0,
6331
- "num_train_epochs": 0,
6332
  "save_steps": 500,
6333
  "stateful_callbacks": {
6334
  "TrainerControl": {
@@ -6342,7 +7169,7 @@
6342
  "attributes": {}
6343
  }
6344
  },
6345
- "total_flos": 1.393076937424896e+19,
6346
  "train_batch_size": 16,
6347
  "trial_name": null,
6348
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 5088,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6316
  "step": 4500
6317
  },
6318
  {
6319
+ "epoch": 0.8854166666666666,
6320
+ "grad_norm": 0.43359375,
6321
+ "learning_rate": 7.893462771773996e-06,
6322
+ "loss": 0.782,
6323
+ "step": 4505
6324
+ },
6325
+ {
6326
+ "epoch": 0.8863993710691824,
6327
+ "grad_norm": 0.337890625,
6328
+ "learning_rate": 7.760421092313152e-06,
6329
+ "loss": 0.7891,
6330
+ "step": 4510
6331
+ },
6332
+ {
6333
+ "epoch": 0.8873820754716981,
6334
+ "grad_norm": 0.326171875,
6335
+ "learning_rate": 7.628464876673202e-06,
6336
+ "loss": 0.8201,
6337
+ "step": 4515
6338
+ },
6339
+ {
6340
+ "epoch": 0.8883647798742138,
6341
+ "grad_norm": 0.3203125,
6342
+ "learning_rate": 7.497595677698388e-06,
6343
+ "loss": 0.8031,
6344
+ "step": 4520
6345
+ },
6346
+ {
6347
+ "epoch": 0.8893474842767296,
6348
+ "grad_norm": 0.32421875,
6349
+ "learning_rate": 7.3678150354410615e-06,
6350
+ "loss": 0.8013,
6351
+ "step": 4525
6352
+ },
6353
+ {
6354
+ "epoch": 0.8903301886792453,
6355
+ "grad_norm": 0.326171875,
6356
+ "learning_rate": 7.239124477143578e-06,
6357
+ "loss": 0.8075,
6358
+ "step": 4530
6359
+ },
6360
+ {
6361
+ "epoch": 0.891312893081761,
6362
+ "grad_norm": 0.310546875,
6363
+ "learning_rate": 7.111525517220308e-06,
6364
+ "loss": 0.7919,
6365
+ "step": 4535
6366
+ },
6367
+ {
6368
+ "epoch": 0.8922955974842768,
6369
+ "grad_norm": 0.298828125,
6370
+ "learning_rate": 6.985019657239867e-06,
6371
+ "loss": 0.8074,
6372
+ "step": 4540
6373
+ },
6374
+ {
6375
+ "epoch": 0.8932783018867925,
6376
+ "grad_norm": 0.314453125,
6377
+ "learning_rate": 6.859608385907379e-06,
6378
+ "loss": 0.8009,
6379
+ "step": 4545
6380
+ },
6381
+ {
6382
+ "epoch": 0.8942610062893082,
6383
+ "grad_norm": 0.310546875,
6384
+ "learning_rate": 6.735293179046975e-06,
6385
+ "loss": 0.8081,
6386
+ "step": 4550
6387
+ },
6388
+ {
6389
+ "epoch": 0.8952437106918238,
6390
+ "grad_norm": 0.3046875,
6391
+ "learning_rate": 6.612075499584458e-06,
6392
+ "loss": 0.8067,
6393
+ "step": 4555
6394
+ },
6395
+ {
6396
+ "epoch": 0.8962264150943396,
6397
+ "grad_norm": 0.3125,
6398
+ "learning_rate": 6.489956797530084e-06,
6399
+ "loss": 0.811,
6400
+ "step": 4560
6401
+ },
6402
+ {
6403
+ "epoch": 0.8972091194968553,
6404
+ "grad_norm": 0.30078125,
6405
+ "learning_rate": 6.368938509961398e-06,
6406
+ "loss": 0.7966,
6407
+ "step": 4565
6408
+ },
6409
+ {
6410
+ "epoch": 0.898191823899371,
6411
+ "grad_norm": 0.328125,
6412
+ "learning_rate": 6.2490220610065155e-06,
6413
+ "loss": 0.8123,
6414
+ "step": 4570
6415
+ },
6416
+ {
6417
+ "epoch": 0.8991745283018868,
6418
+ "grad_norm": 0.30859375,
6419
+ "learning_rate": 6.130208861827202e-06,
6420
+ "loss": 0.8045,
6421
+ "step": 4575
6422
+ },
6423
+ {
6424
+ "epoch": 0.9001572327044025,
6425
+ "grad_norm": 0.302734375,
6426
+ "learning_rate": 6.012500310602254e-06,
6427
+ "loss": 0.7923,
6428
+ "step": 4580
6429
+ },
6430
+ {
6431
+ "epoch": 0.9011399371069182,
6432
+ "grad_norm": 0.30859375,
6433
+ "learning_rate": 5.8958977925112405e-06,
6434
+ "loss": 0.7986,
6435
+ "step": 4585
6436
+ },
6437
+ {
6438
+ "epoch": 0.902122641509434,
6439
+ "grad_norm": 0.322265625,
6440
+ "learning_rate": 5.780402679717989e-06,
6441
+ "loss": 0.8166,
6442
+ "step": 4590
6443
+ },
6444
+ {
6445
+ "epoch": 0.9031053459119497,
6446
+ "grad_norm": 0.298828125,
6447
+ "learning_rate": 5.666016331354485e-06,
6448
+ "loss": 0.7845,
6449
+ "step": 4595
6450
+ },
6451
+ {
6452
+ "epoch": 0.9040880503144654,
6453
+ "grad_norm": 0.330078125,
6454
+ "learning_rate": 5.552740093505015e-06,
6455
+ "loss": 0.7865,
6456
+ "step": 4600
6457
+ },
6458
+ {
6459
+ "epoch": 0.9050707547169812,
6460
+ "grad_norm": 0.30859375,
6461
+ "learning_rate": 5.440575299190165e-06,
6462
+ "loss": 0.8243,
6463
+ "step": 4605
6464
+ },
6465
+ {
6466
+ "epoch": 0.9060534591194969,
6467
+ "grad_norm": 0.31640625,
6468
+ "learning_rate": 5.329523268351155e-06,
6469
+ "loss": 0.8041,
6470
+ "step": 4610
6471
+ },
6472
+ {
6473
+ "epoch": 0.9070361635220126,
6474
+ "grad_norm": 0.310546875,
6475
+ "learning_rate": 5.219585307834407e-06,
6476
+ "loss": 0.8057,
6477
+ "step": 4615
6478
+ },
6479
+ {
6480
+ "epoch": 0.9080188679245284,
6481
+ "grad_norm": 0.294921875,
6482
+ "learning_rate": 5.110762711376116e-06,
6483
+ "loss": 0.7987,
6484
+ "step": 4620
6485
+ },
6486
+ {
6487
+ "epoch": 0.909001572327044,
6488
+ "grad_norm": 0.326171875,
6489
+ "learning_rate": 5.003056759586944e-06,
6490
+ "loss": 0.7983,
6491
+ "step": 4625
6492
+ },
6493
+ {
6494
+ "epoch": 0.9099842767295597,
6495
+ "grad_norm": 0.3203125,
6496
+ "learning_rate": 4.89646871993703e-06,
6497
+ "loss": 0.7872,
6498
+ "step": 4630
6499
+ },
6500
+ {
6501
+ "epoch": 0.9109669811320755,
6502
+ "grad_norm": 0.31640625,
6503
+ "learning_rate": 4.79099984674114e-06,
6504
+ "loss": 0.8203,
6505
+ "step": 4635
6506
+ },
6507
+ {
6508
+ "epoch": 0.9119496855345912,
6509
+ "grad_norm": 0.318359375,
6510
+ "learning_rate": 4.6866513811437475e-06,
6511
+ "loss": 0.7816,
6512
+ "step": 4640
6513
+ },
6514
+ {
6515
+ "epoch": 0.9129323899371069,
6516
+ "grad_norm": 0.30078125,
6517
+ "learning_rate": 4.58342455110452e-06,
6518
+ "loss": 0.8151,
6519
+ "step": 4645
6520
+ },
6521
+ {
6522
+ "epoch": 0.9139150943396226,
6523
+ "grad_norm": 0.29296875,
6524
+ "learning_rate": 4.481320571383907e-06,
6525
+ "loss": 0.8052,
6526
+ "step": 4650
6527
+ },
6528
+ {
6529
+ "epoch": 0.9148977987421384,
6530
+ "grad_norm": 0.31640625,
6531
+ "learning_rate": 4.380340643528735e-06,
6532
+ "loss": 0.8069,
6533
+ "step": 4655
6534
+ },
6535
+ {
6536
+ "epoch": 0.9158805031446541,
6537
+ "grad_norm": 0.328125,
6538
+ "learning_rate": 4.280485955858171e-06,
6539
+ "loss": 0.7986,
6540
+ "step": 4660
6541
+ },
6542
+ {
6543
+ "epoch": 0.9168632075471698,
6544
+ "grad_norm": 0.310546875,
6545
+ "learning_rate": 4.181757683449694e-06,
6546
+ "loss": 0.8219,
6547
+ "step": 4665
6548
+ },
6549
+ {
6550
+ "epoch": 0.9178459119496856,
6551
+ "grad_norm": 0.30859375,
6552
+ "learning_rate": 4.084156988125231e-06,
6553
+ "loss": 0.8162,
6554
+ "step": 4670
6555
+ },
6556
+ {
6557
+ "epoch": 0.9188286163522013,
6558
+ "grad_norm": 0.314453125,
6559
+ "learning_rate": 3.987685018437581e-06,
6560
+ "loss": 0.7972,
6561
+ "step": 4675
6562
+ },
6563
+ {
6564
+ "epoch": 0.9198113207547169,
6565
+ "grad_norm": 0.30859375,
6566
+ "learning_rate": 3.892342909656776e-06,
6567
+ "loss": 0.8163,
6568
+ "step": 4680
6569
+ },
6570
+ {
6571
+ "epoch": 0.9207940251572327,
6572
+ "grad_norm": 0.310546875,
6573
+ "learning_rate": 3.798131783756853e-06,
6574
+ "loss": 0.8151,
6575
+ "step": 4685
6576
+ },
6577
+ {
6578
+ "epoch": 0.9217767295597484,
6579
+ "grad_norm": 0.310546875,
6580
+ "learning_rate": 3.7050527494025265e-06,
6581
+ "loss": 0.8023,
6582
+ "step": 4690
6583
+ },
6584
+ {
6585
+ "epoch": 0.9227594339622641,
6586
+ "grad_norm": 0.322265625,
6587
+ "learning_rate": 3.6131069019362362e-06,
6588
+ "loss": 0.8229,
6589
+ "step": 4695
6590
+ },
6591
+ {
6592
+ "epoch": 0.9237421383647799,
6593
+ "grad_norm": 0.302734375,
6594
+ "learning_rate": 3.52229532336521e-06,
6595
+ "loss": 0.7951,
6596
+ "step": 4700
6597
+ },
6598
+ {
6599
+ "epoch": 0.9247248427672956,
6600
+ "grad_norm": 0.314453125,
6601
+ "learning_rate": 3.4326190823487315e-06,
6602
+ "loss": 0.8034,
6603
+ "step": 4705
6604
+ },
6605
+ {
6606
+ "epoch": 0.9257075471698113,
6607
+ "grad_norm": 0.30859375,
6608
+ "learning_rate": 3.344079234185604e-06,
6609
+ "loss": 0.807,
6610
+ "step": 4710
6611
+ },
6612
+ {
6613
+ "epoch": 0.9266902515723271,
6614
+ "grad_norm": 0.306640625,
6615
+ "learning_rate": 3.2566768208016297e-06,
6616
+ "loss": 0.8122,
6617
+ "step": 4715
6618
+ },
6619
+ {
6620
+ "epoch": 0.9276729559748428,
6621
+ "grad_norm": 0.30859375,
6622
+ "learning_rate": 3.170412870737516e-06,
6623
+ "loss": 0.8023,
6624
+ "step": 4720
6625
+ },
6626
+ {
6627
+ "epoch": 0.9286556603773585,
6628
+ "grad_norm": 0.3046875,
6629
+ "learning_rate": 3.0852883991366322e-06,
6630
+ "loss": 0.7757,
6631
+ "step": 4725
6632
+ },
6633
+ {
6634
+ "epoch": 0.9296383647798742,
6635
+ "grad_norm": 0.306640625,
6636
+ "learning_rate": 3.0013044077330744e-06,
6637
+ "loss": 0.7709,
6638
+ "step": 4730
6639
+ },
6640
+ {
6641
+ "epoch": 0.93062106918239,
6642
+ "grad_norm": 0.322265625,
6643
+ "learning_rate": 2.9184618848399627e-06,
6644
+ "loss": 0.8331,
6645
+ "step": 4735
6646
+ },
6647
+ {
6648
+ "epoch": 0.9316037735849056,
6649
+ "grad_norm": 0.3125,
6650
+ "learning_rate": 2.836761805337762e-06,
6651
+ "loss": 0.7819,
6652
+ "step": 4740
6653
+ },
6654
+ {
6655
+ "epoch": 0.9325864779874213,
6656
+ "grad_norm": 0.33984375,
6657
+ "learning_rate": 2.756205130662737e-06,
6658
+ "loss": 0.7949,
6659
+ "step": 4745
6660
+ },
6661
+ {
6662
+ "epoch": 0.9335691823899371,
6663
+ "grad_norm": 0.31640625,
6664
+ "learning_rate": 2.6767928087957693e-06,
6665
+ "loss": 0.8147,
6666
+ "step": 4750
6667
+ },
6668
+ {
6669
+ "epoch": 0.9345518867924528,
6670
+ "grad_norm": 0.30078125,
6671
+ "learning_rate": 2.598525774251159e-06,
6672
+ "loss": 0.7786,
6673
+ "step": 4755
6674
+ },
6675
+ {
6676
+ "epoch": 0.9355345911949685,
6677
+ "grad_norm": 0.302734375,
6678
+ "learning_rate": 2.52140494806552e-06,
6679
+ "loss": 0.7954,
6680
+ "step": 4760
6681
+ },
6682
+ {
6683
+ "epoch": 0.9365172955974843,
6684
+ "grad_norm": 0.30859375,
6685
+ "learning_rate": 2.44543123778711e-06,
6686
+ "loss": 0.7851,
6687
+ "step": 4765
6688
+ },
6689
+ {
6690
+ "epoch": 0.9375,
6691
+ "grad_norm": 0.3046875,
6692
+ "learning_rate": 2.370605537465065e-06,
6693
+ "loss": 0.81,
6694
+ "step": 4770
6695
+ },
6696
+ {
6697
+ "epoch": 0.9384827044025157,
6698
+ "grad_norm": 0.302734375,
6699
+ "learning_rate": 2.296928727638814e-06,
6700
+ "loss": 0.8305,
6701
+ "step": 4775
6702
+ },
6703
+ {
6704
+ "epoch": 0.9394654088050315,
6705
+ "grad_norm": 0.3046875,
6706
+ "learning_rate": 2.2244016753278586e-06,
6707
+ "loss": 0.7896,
6708
+ "step": 4780
6709
+ },
6710
+ {
6711
+ "epoch": 0.9404481132075472,
6712
+ "grad_norm": 0.3046875,
6713
+ "learning_rate": 2.1530252340214996e-06,
6714
+ "loss": 0.8101,
6715
+ "step": 4785
6716
+ },
6717
+ {
6718
+ "epoch": 0.9414308176100629,
6719
+ "grad_norm": 0.31640625,
6720
+ "learning_rate": 2.0828002436687257e-06,
6721
+ "loss": 0.805,
6722
+ "step": 4790
6723
+ },
6724
+ {
6725
+ "epoch": 0.9424135220125787,
6726
+ "grad_norm": 0.310546875,
6727
+ "learning_rate": 2.013727530668452e-06,
6728
+ "loss": 0.804,
6729
+ "step": 4795
6730
+ },
6731
+ {
6732
+ "epoch": 0.9433962264150944,
6733
+ "grad_norm": 0.314453125,
6734
+ "learning_rate": 1.9458079078597203e-06,
6735
+ "loss": 0.825,
6736
+ "step": 4800
6737
+ },
6738
+ {
6739
+ "epoch": 0.94437893081761,
6740
+ "grad_norm": 0.3046875,
6741
+ "learning_rate": 1.8790421745121356e-06,
6742
+ "loss": 0.821,
6743
+ "step": 4805
6744
+ },
6745
+ {
6746
+ "epoch": 0.9453616352201258,
6747
+ "grad_norm": 0.310546875,
6748
+ "learning_rate": 1.813431116316522e-06,
6749
+ "loss": 0.8101,
6750
+ "step": 4810
6751
+ },
6752
+ {
6753
+ "epoch": 0.9463443396226415,
6754
+ "grad_norm": 0.30859375,
6755
+ "learning_rate": 1.748975505375583e-06,
6756
+ "loss": 0.8016,
6757
+ "step": 4815
6758
+ },
6759
+ {
6760
+ "epoch": 0.9473270440251572,
6761
+ "grad_norm": 0.296875,
6762
+ "learning_rate": 1.6856761001948772e-06,
6763
+ "loss": 0.7847,
6764
+ "step": 4820
6765
+ },
6766
+ {
6767
+ "epoch": 0.9483097484276729,
6768
+ "grad_norm": 0.3203125,
6769
+ "learning_rate": 1.6235336456739026e-06,
6770
+ "loss": 0.8007,
6771
+ "step": 4825
6772
+ },
6773
+ {
6774
+ "epoch": 0.9492924528301887,
6775
+ "grad_norm": 0.310546875,
6776
+ "learning_rate": 1.5625488730972693e-06,
6777
+ "loss": 0.7891,
6778
+ "step": 4830
6779
+ },
6780
+ {
6781
+ "epoch": 0.9502751572327044,
6782
+ "grad_norm": 0.30859375,
6783
+ "learning_rate": 1.5027225001261525e-06,
6784
+ "loss": 0.8244,
6785
+ "step": 4835
6786
+ },
6787
+ {
6788
+ "epoch": 0.9512578616352201,
6789
+ "grad_norm": 0.298828125,
6790
+ "learning_rate": 1.4440552307898202e-06,
6791
+ "loss": 0.7962,
6792
+ "step": 4840
6793
+ },
6794
+ {
6795
+ "epoch": 0.9522405660377359,
6796
+ "grad_norm": 0.306640625,
6797
+ "learning_rate": 1.386547755477363e-06,
6798
+ "loss": 0.7982,
6799
+ "step": 4845
6800
+ },
6801
+ {
6802
+ "epoch": 0.9532232704402516,
6803
+ "grad_norm": 0.318359375,
6804
+ "learning_rate": 1.3302007509295445e-06,
6805
+ "loss": 0.7896,
6806
+ "step": 4850
6807
+ },
6808
+ {
6809
+ "epoch": 0.9542059748427673,
6810
+ "grad_norm": 0.310546875,
6811
+ "learning_rate": 1.2750148802308737e-06,
6812
+ "loss": 0.8158,
6813
+ "step": 4855
6814
+ },
6815
+ {
6816
+ "epoch": 0.9551886792452831,
6817
+ "grad_norm": 0.3125,
6818
+ "learning_rate": 1.2209907928017795e-06,
6819
+ "loss": 0.8012,
6820
+ "step": 4860
6821
+ },
6822
+ {
6823
+ "epoch": 0.9561713836477987,
6824
+ "grad_norm": 0.310546875,
6825
+ "learning_rate": 1.1681291243909153e-06,
6826
+ "loss": 0.8146,
6827
+ "step": 4865
6828
+ },
6829
+ {
6830
+ "epoch": 0.9571540880503144,
6831
+ "grad_norm": 0.330078125,
6832
+ "learning_rate": 1.116430497067833e-06,
6833
+ "loss": 0.8175,
6834
+ "step": 4870
6835
+ },
6836
+ {
6837
+ "epoch": 0.9581367924528302,
6838
+ "grad_norm": 0.3125,
6839
+ "learning_rate": 1.0658955192154763e-06,
6840
+ "loss": 0.7937,
6841
+ "step": 4875
6842
+ },
6843
+ {
6844
+ "epoch": 0.9591194968553459,
6845
+ "grad_norm": 0.3125,
6846
+ "learning_rate": 1.0165247855231542e-06,
6847
+ "loss": 0.8,
6848
+ "step": 4880
6849
+ },
6850
+ {
6851
+ "epoch": 0.9601022012578616,
6852
+ "grad_norm": 0.314453125,
6853
+ "learning_rate": 9.683188769794792e-07,
6854
+ "loss": 0.8042,
6855
+ "step": 4885
6856
+ },
6857
+ {
6858
+ "epoch": 0.9610849056603774,
6859
+ "grad_norm": 0.298828125,
6860
+ "learning_rate": 9.212783608655518e-07,
6861
+ "loss": 0.8078,
6862
+ "step": 4890
6863
+ },
6864
+ {
6865
+ "epoch": 0.9620676100628931,
6866
+ "grad_norm": 0.31640625,
6867
+ "learning_rate": 8.754037907482748e-07,
6868
+ "loss": 0.7992,
6869
+ "step": 4895
6870
+ },
6871
+ {
6872
+ "epoch": 0.9630503144654088,
6873
+ "grad_norm": 0.306640625,
6874
+ "learning_rate": 8.306957064738385e-07,
6875
+ "loss": 0.806,
6876
+ "step": 4900
6877
+ },
6878
+ {
6879
+ "epoch": 0.9640330188679245,
6880
+ "grad_norm": 0.31640625,
6881
+ "learning_rate": 7.871546341614023e-07,
6882
+ "loss": 0.7803,
6883
+ "step": 4905
6884
+ },
6885
+ {
6886
+ "epoch": 0.9650157232704403,
6887
+ "grad_norm": 0.3046875,
6888
+ "learning_rate": 7.447810861968552e-07,
6889
+ "loss": 0.7864,
6890
+ "step": 4910
6891
+ },
6892
+ {
6893
+ "epoch": 0.965998427672956,
6894
+ "grad_norm": 0.30859375,
6895
+ "learning_rate": 7.03575561226788e-07,
6896
+ "loss": 0.7837,
6897
+ "step": 4915
6898
+ },
6899
+ {
6900
+ "epoch": 0.9669811320754716,
6901
+ "grad_norm": 0.302734375,
6902
+ "learning_rate": 6.635385441526754e-07,
6903
+ "loss": 0.7935,
6904
+ "step": 4920
6905
+ },
6906
+ {
6907
+ "epoch": 0.9679638364779874,
6908
+ "grad_norm": 0.314453125,
6909
+ "learning_rate": 6.246705061251245e-07,
6910
+ "loss": 0.8074,
6911
+ "step": 4925
6912
+ },
6913
+ {
6914
+ "epoch": 0.9689465408805031,
6915
+ "grad_norm": 0.298828125,
6916
+ "learning_rate": 5.86971904538347e-07,
6917
+ "loss": 0.8082,
6918
+ "step": 4930
6919
+ },
6920
+ {
6921
+ "epoch": 0.9699292452830188,
6922
+ "grad_norm": 0.3125,
6923
+ "learning_rate": 5.504431830247514e-07,
6924
+ "loss": 0.7889,
6925
+ "step": 4935
6926
+ },
6927
+ {
6928
+ "epoch": 0.9709119496855346,
6929
+ "grad_norm": 0.306640625,
6930
+ "learning_rate": 5.150847714497697e-07,
6931
+ "loss": 0.7924,
6932
+ "step": 4940
6933
+ },
6934
+ {
6935
+ "epoch": 0.9718946540880503,
6936
+ "grad_norm": 0.296875,
6937
+ "learning_rate": 4.80897085906773e-07,
6938
+ "loss": 0.81,
6939
+ "step": 4945
6940
+ },
6941
+ {
6942
+ "epoch": 0.972877358490566,
6943
+ "grad_norm": 0.294921875,
6944
+ "learning_rate": 4.4788052871215234e-07,
6945
+ "loss": 0.805,
6946
+ "step": 4950
6947
+ },
6948
+ {
6949
+ "epoch": 0.9738600628930818,
6950
+ "grad_norm": 0.30078125,
6951
+ "learning_rate": 4.1603548840062345e-07,
6952
+ "loss": 0.8101,
6953
+ "step": 4955
6954
+ },
6955
+ {
6956
+ "epoch": 0.9748427672955975,
6957
+ "grad_norm": 0.3046875,
6958
+ "learning_rate": 3.853623397206407e-07,
6959
+ "loss": 0.7909,
6960
+ "step": 4960
6961
+ },
6962
+ {
6963
+ "epoch": 0.9758254716981132,
6964
+ "grad_norm": 0.302734375,
6965
+ "learning_rate": 3.5586144362997896e-07,
6966
+ "loss": 0.7972,
6967
+ "step": 4965
6968
+ },
6969
+ {
6970
+ "epoch": 0.976808176100629,
6971
+ "grad_norm": 0.314453125,
6972
+ "learning_rate": 3.275331472914922e-07,
6973
+ "loss": 0.8101,
6974
+ "step": 4970
6975
+ },
6976
+ {
6977
+ "epoch": 0.9777908805031447,
6978
+ "grad_norm": 0.3125,
6979
+ "learning_rate": 3.0037778406902805e-07,
6980
+ "loss": 0.8184,
6981
+ "step": 4975
6982
+ },
6983
+ {
6984
+ "epoch": 0.9787735849056604,
6985
+ "grad_norm": 0.3125,
6986
+ "learning_rate": 2.743956735234865e-07,
6987
+ "loss": 0.782,
6988
+ "step": 4980
6989
+ },
6990
+ {
6991
+ "epoch": 0.9797562893081762,
6992
+ "grad_norm": 0.322265625,
6993
+ "learning_rate": 2.4958712140911166e-07,
6994
+ "loss": 0.7905,
6995
+ "step": 4985
6996
+ },
6997
+ {
6998
+ "epoch": 0.9807389937106918,
6999
+ "grad_norm": 0.310546875,
7000
+ "learning_rate": 2.2595241966982817e-07,
7001
+ "loss": 0.8163,
7002
+ "step": 4990
7003
+ },
7004
+ {
7005
+ "epoch": 0.9817216981132075,
7006
+ "grad_norm": 0.3125,
7007
+ "learning_rate": 2.0349184643586595e-07,
7008
+ "loss": 0.8266,
7009
+ "step": 4995
7010
+ },
7011
+ {
7012
+ "epoch": 0.9827044025157232,
7013
+ "grad_norm": 0.30859375,
7014
+ "learning_rate": 1.8220566602040745e-07,
7015
+ "loss": 0.8174,
7016
+ "step": 5000
7017
+ },
7018
+ {
7019
+ "epoch": 0.983687106918239,
7020
+ "grad_norm": 0.302734375,
7021
+ "learning_rate": 1.6209412891659003e-07,
7022
+ "loss": 0.8052,
7023
+ "step": 5005
7024
+ },
7025
+ {
7026
+ "epoch": 0.9846698113207547,
7027
+ "grad_norm": 0.302734375,
7028
+ "learning_rate": 1.4315747179446392e-07,
7029
+ "loss": 0.7871,
7030
+ "step": 5010
7031
+ },
7032
+ {
7033
+ "epoch": 0.9856525157232704,
7034
+ "grad_norm": 0.31640625,
7035
+ "learning_rate": 1.2539591749821666e-07,
7036
+ "loss": 0.7973,
7037
+ "step": 5015
7038
+ },
7039
+ {
7040
+ "epoch": 0.9866352201257862,
7041
+ "grad_norm": 0.33203125,
7042
+ "learning_rate": 1.088096750436085e-07,
7043
+ "loss": 0.7972,
7044
+ "step": 5020
7045
+ },
7046
+ {
7047
+ "epoch": 0.9876179245283019,
7048
+ "grad_norm": 0.31640625,
7049
+ "learning_rate": 9.339893961548551e-08,
7050
+ "loss": 0.8152,
7051
+ "step": 5025
7052
+ },
7053
+ {
7054
+ "epoch": 0.9886006289308176,
7055
+ "grad_norm": 0.310546875,
7056
+ "learning_rate": 7.916389256541479e-08,
7057
+ "loss": 0.8147,
7058
+ "step": 5030
7059
+ },
7060
+ {
7061
+ "epoch": 0.9895833333333334,
7062
+ "grad_norm": 0.302734375,
7063
+ "learning_rate": 6.610470140967495e-08,
7064
+ "loss": 0.81,
7065
+ "step": 5035
7066
+ },
7067
+ {
7068
+ "epoch": 0.9905660377358491,
7069
+ "grad_norm": 0.310546875,
7070
+ "learning_rate": 5.422151982719115e-08,
7071
+ "loss": 0.8167,
7072
+ "step": 5040
7073
+ },
7074
+ {
7075
+ "epoch": 0.9915487421383647,
7076
+ "grad_norm": 0.330078125,
7077
+ "learning_rate": 4.351448765775867e-08,
7078
+ "loss": 0.8175,
7079
+ "step": 5045
7080
+ },
7081
+ {
7082
+ "epoch": 0.9925314465408805,
7083
+ "grad_norm": 0.310546875,
7084
+ "learning_rate": 3.3983730900377655e-08,
7085
+ "loss": 0.8009,
7086
+ "step": 5050
7087
+ },
7088
+ {
7089
+ "epoch": 0.9935141509433962,
7090
+ "grad_norm": 0.30078125,
7091
+ "learning_rate": 2.5629361711809742e-08,
7092
+ "loss": 0.8025,
7093
+ "step": 5055
7094
+ },
7095
+ {
7096
+ "epoch": 0.9944968553459119,
7097
+ "grad_norm": 0.30078125,
7098
+ "learning_rate": 1.8451478405223653e-08,
7099
+ "loss": 0.7953,
7100
+ "step": 5060
7101
+ },
7102
+ {
7103
+ "epoch": 0.9954795597484277,
7104
+ "grad_norm": 0.314453125,
7105
+ "learning_rate": 1.2450165449062744e-08,
7106
+ "loss": 0.7893,
7107
+ "step": 5065
7108
+ },
7109
+ {
7110
+ "epoch": 0.9964622641509434,
7111
+ "grad_norm": 0.3046875,
7112
+ "learning_rate": 7.62549346601249e-09,
7113
+ "loss": 0.8113,
7114
+ "step": 5070
7115
+ },
7116
+ {
7117
+ "epoch": 0.9974449685534591,
7118
+ "grad_norm": 0.302734375,
7119
+ "learning_rate": 3.977519232223337e-09,
7120
+ "loss": 0.8174,
7121
+ "step": 5075
7122
+ },
7123
+ {
7124
+ "epoch": 0.9984276729559748,
7125
+ "grad_norm": 0.302734375,
7126
+ "learning_rate": 1.5062856765779565e-09,
7127
+ "loss": 0.8089,
7128
+ "step": 5080
7129
+ },
7130
+ {
7131
+ "epoch": 0.9994103773584906,
7132
+ "grad_norm": 0.3125,
7133
+ "learning_rate": 2.118218802582561e-10,
7134
+ "loss": 0.8288,
7135
+ "step": 5085
7136
+ },
7137
+ {
7138
+ "epoch": 1.0,
7139
+ "eval_loss": 0.7983009815216064,
7140
+ "eval_runtime": 7962.7938,
7141
+ "eval_samples_per_second": 10.22,
7142
+ "eval_steps_per_second": 0.16,
7143
+ "step": 5088
7144
+ },
7145
+ {
7146
+ "epoch": 1.0,
7147
+ "step": 5088,
7148
+ "total_flos": 1.5751056572484157e+19,
7149
+ "train_loss": 0.09284130807192821,
7150
+ "train_runtime": 20560.1048,
7151
+ "train_samples_per_second": 15.837,
7152
+ "train_steps_per_second": 0.247
7153
  }
7154
  ],
7155
  "logging_steps": 5,
7156
+ "max_steps": 5088,
7157
  "num_input_tokens_seen": 0,
7158
+ "num_train_epochs": 1,
7159
  "save_steps": 500,
7160
  "stateful_callbacks": {
7161
  "TrainerControl": {
 
7169
  "attributes": {}
7170
  }
7171
  },
7172
+ "total_flos": 1.5751056572484157e+19,
7173
  "train_batch_size": 16,
7174
  "trial_name": null,
7175
  "trial_params": null