AAA01101312 committed on
Commit
2351dee
1 Parent(s): 89a42ef

Training in progress, step 2500

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50):
  1. model.safetensors +1 -1
  2. run-0/checkpoint-1000/model.safetensors +1 -1
  3. run-0/checkpoint-1000/optimizer.pt +1 -1
  4. run-0/checkpoint-1000/scheduler.pt +1 -1
  5. run-0/checkpoint-1000/trainer_state.json +27 -27
  6. run-0/checkpoint-1000/training_args.bin +1 -1
  7. run-0/checkpoint-1500/model.safetensors +1 -1
  8. run-0/checkpoint-1500/optimizer.pt +1 -1
  9. run-0/checkpoint-1500/scheduler.pt +1 -1
  10. run-0/checkpoint-1500/trainer_state.json +35 -35
  11. run-0/checkpoint-1500/training_args.bin +1 -1
  12. run-0/checkpoint-500/model.safetensors +1 -1
  13. run-0/checkpoint-500/optimizer.pt +1 -1
  14. run-0/checkpoint-500/scheduler.pt +1 -1
  15. run-0/checkpoint-500/trainer_state.json +14 -14
  16. run-0/checkpoint-500/training_args.bin +1 -1
  17. run-1/checkpoint-1000/model.safetensors +1 -1
  18. run-1/checkpoint-1000/optimizer.pt +1 -1
  19. run-1/checkpoint-1000/scheduler.pt +1 -1
  20. run-1/checkpoint-1000/trainer_state.json +26 -26
  21. run-1/checkpoint-1000/training_args.bin +1 -1
  22. run-1/checkpoint-1500/model.safetensors +1 -1
  23. run-1/checkpoint-1500/optimizer.pt +1 -1
  24. run-1/checkpoint-1500/scheduler.pt +1 -1
  25. run-1/checkpoint-1500/trainer_state.json +34 -34
  26. run-1/checkpoint-1500/training_args.bin +1 -1
  27. run-1/checkpoint-2000/model.safetensors +1 -1
  28. run-1/checkpoint-2000/optimizer.pt +1 -1
  29. run-1/checkpoint-2000/scheduler.pt +1 -1
  30. run-1/checkpoint-2000/trainer_state.json +47 -47
  31. run-1/checkpoint-2000/training_args.bin +1 -1
  32. run-1/checkpoint-500/model.safetensors +1 -1
  33. run-1/checkpoint-500/optimizer.pt +1 -1
  34. run-1/checkpoint-500/scheduler.pt +1 -1
  35. run-1/checkpoint-500/trainer_state.json +13 -13
  36. run-1/checkpoint-500/training_args.bin +1 -1
  37. run-2/checkpoint-1000/model.safetensors +1 -1
  38. run-2/checkpoint-1000/optimizer.pt +1 -1
  39. run-2/checkpoint-1000/scheduler.pt +1 -1
  40. run-2/checkpoint-1000/trainer_state.json +26 -26
  41. run-2/checkpoint-1000/training_args.bin +1 -1
  42. run-2/checkpoint-1500/model.safetensors +1 -1
  43. run-2/checkpoint-1500/optimizer.pt +1 -1
  44. run-2/checkpoint-1500/scheduler.pt +1 -1
  45. run-2/checkpoint-1500/trainer_state.json +34 -34
  46. run-2/checkpoint-1500/training_args.bin +1 -1
  47. run-2/checkpoint-2000/model.safetensors +1 -1
  48. run-2/checkpoint-2000/optimizer.pt +1 -1
  49. run-2/checkpoint-2000/scheduler.pt +1 -1
  50. run-2/checkpoint-2000/trainer_state.json +47 -47
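
All of the binary files listed above (model.safetensors, optimizer.pt, scheduler.pt, training_args.bin) are tracked with Git LFS, so the per-file diffs below only touch the LFS pointer files: the sha256 object id changes while the recorded byte size stays the same. As a minimal sketch (illustrative only, not code from this repository), the pointer fields shown in the diffs can be recomputed from a local file:

# Sketch: rebuild the Git LFS pointer fields for a local file.
# The oid is the SHA-256 digest of the file's contents; size is its length in bytes.
import hashlib
import os

def lfs_pointer(path: str) -> str:
    sha = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            sha.update(chunk)
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{sha.hexdigest()}\n"
        f"size {os.path.getsize(path)}\n"
    )

print(lfs_pointer("model.safetensors"))  # example path taken from the file list above
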
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d2f004909c4c1c00205dfbb58cc5b713177ceabf4fd16c69d18341822949b30
+oid sha256:78016a20790cea0a2ba4e424321e0f9b1c92dba6f42f3058778c503ac0b8e500
 size 268290900
run-0/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:de945d0f6febbd9b5ecd77d9bc33469592a981879cc654501fb0ff20d21f5745
+oid sha256:fcdfe9a6c39ec2298ff882406bfc249659761f4552c8ab3392cfd68269c4ee22
 size 268290900
run-0/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6698b6cec0b6b83f21b51c1d31c8d67c57acbc5591039ba24a1cbeb0c2b1c052
+oid sha256:37e2f2b450d6975ded453dd9ba20a24ede194f9df45c36ed9c5a4048276a5e29
 size 536643898
run-0/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab24af5732cdd2b225edfa7b748cb6638d90aa8efee657214a80de4c7f962c49
+oid sha256:d435f08c91fbb25ecefa6816f8694ea265b07f7526f27bca41c28b9ad50fad06
 size 1064
run-0/checkpoint-1000/trainer_state.json CHANGED
@@ -10,57 +10,57 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.5819354838709677,
-      "eval_loss": 0.19125010073184967,
-      "eval_runtime": 14.8029,
-      "eval_samples_per_second": 209.418,
-      "eval_steps_per_second": 26.211,
+      "eval_accuracy": 0.5893548387096774,
+      "eval_loss": 0.21903084218502045,
+      "eval_runtime": 14.3987,
+      "eval_samples_per_second": 215.297,
+      "eval_steps_per_second": 26.947,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5311678647994995,
-      "learning_rate": 1.650593990216632e-05,
-      "loss": 0.3069,
+      "grad_norm": 0.5576276779174805,
+      "learning_rate": 1.4758909853249476e-05,
+      "loss": 0.3448,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8219354838709677,
-      "eval_loss": 0.0939890518784523,
-      "eval_runtime": 17.4359,
-      "eval_samples_per_second": 177.795,
-      "eval_steps_per_second": 22.253,
+      "eval_accuracy": 0.8183870967741935,
+      "eval_loss": 0.10806020349264145,
+      "eval_runtime": 15.1708,
+      "eval_samples_per_second": 204.339,
+      "eval_steps_per_second": 25.575,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8845161290322581,
-      "eval_loss": 0.062438562512397766,
-      "eval_runtime": 16.7447,
-      "eval_samples_per_second": 185.134,
-      "eval_steps_per_second": 23.172,
+      "eval_accuracy": 0.8777419354838709,
+      "eval_loss": 0.07312986254692078,
+      "eval_runtime": 15.1972,
+      "eval_samples_per_second": 203.985,
+      "eval_steps_per_second": 25.531,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.48808130621910095,
-      "learning_rate": 1.3011879804332637e-05,
-      "loss": 0.1082,
+      "grad_norm": 0.5403371453285217,
+      "learning_rate": 9.517819706498952e-06,
+      "loss": 0.1247,
       "step": 1000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 2862,
+  "max_steps": 1908,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 9,
+  "num_train_epochs": 6,
   "save_steps": 500,
-  "total_flos": 260941334653608.0,
+  "total_flos": 308320501960968.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.273786553856763,
-    "num_train_epochs": 9,
-    "temperature": 19
+    "alpha": 0.04291496094703673,
+    "num_train_epochs": 6,
+    "temperature": 7
   }
 }
run-0/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:637e9b5b80c6655b5c46a551d36eed90807960f307623b1b6dbdd399e91402a8
+oid sha256:e6d301d3792da68a32e564cb44e96306a109e871eaaeebe23fb207a1f20fef33
 size 5048
run-0/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2de11ae24bdc45ae8f21fdb52ce7574e052dd18f7024b89577650ccb74ae044d
+oid sha256:d1e69f0703aa726ed38410f75abe7aa65b71338a02f00cd30eb48e9e28ae82cc
 size 268290900
run-0/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:722b7024514af073ce3492b0638b222f1cf0033d4420e2d2d75ff3ccf4102d38
+oid sha256:6de24fd3dc67d7467a96bfbf4a84f84db15a52ecbb6a6b663db5eb95e5a246fe
 size 536643898
run-0/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d64c2fcc199bcaa9f6cc6a50071234f29de10ac43b1d2efd9b3a1263b2bd2351
+oid sha256:b2d0d96760a1cdcbc417a57dd4ff944b6ece136ebbdfecf57b1e511053d5ab0b
 size 1064
run-0/checkpoint-1500/trainer_state.json CHANGED
@@ -10,73 +10,73 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.5819354838709677,
-      "eval_loss": 0.19125010073184967,
-      "eval_runtime": 14.8029,
-      "eval_samples_per_second": 209.418,
-      "eval_steps_per_second": 26.211,
+      "eval_accuracy": 0.5893548387096774,
+      "eval_loss": 0.21903084218502045,
+      "eval_runtime": 14.3987,
+      "eval_samples_per_second": 215.297,
+      "eval_steps_per_second": 26.947,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5311678647994995,
-      "learning_rate": 1.650593990216632e-05,
-      "loss": 0.3069,
+      "grad_norm": 0.5576276779174805,
+      "learning_rate": 1.4758909853249476e-05,
+      "loss": 0.3448,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8219354838709677,
-      "eval_loss": 0.0939890518784523,
-      "eval_runtime": 17.4359,
-      "eval_samples_per_second": 177.795,
-      "eval_steps_per_second": 22.253,
+      "eval_accuracy": 0.8183870967741935,
+      "eval_loss": 0.10806020349264145,
+      "eval_runtime": 15.1708,
+      "eval_samples_per_second": 204.339,
+      "eval_steps_per_second": 25.575,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8845161290322581,
-      "eval_loss": 0.062438562512397766,
-      "eval_runtime": 16.7447,
-      "eval_samples_per_second": 185.134,
-      "eval_steps_per_second": 23.172,
+      "eval_accuracy": 0.8777419354838709,
+      "eval_loss": 0.07312986254692078,
+      "eval_runtime": 15.1972,
+      "eval_samples_per_second": 203.985,
+      "eval_steps_per_second": 25.531,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.48808130621910095,
-      "learning_rate": 1.3011879804332637e-05,
-      "loss": 0.1082,
+      "grad_norm": 0.5403371453285217,
+      "learning_rate": 9.517819706498952e-06,
+      "loss": 0.1247,
       "step": 1000
     },
     {
       "epoch": 4.0,
-      "eval_accuracy": 0.9,
-      "eval_loss": 0.048399314284324646,
-      "eval_runtime": 17.8759,
-      "eval_samples_per_second": 173.418,
-      "eval_steps_per_second": 21.705,
+      "eval_accuracy": 0.8941935483870967,
+      "eval_loss": 0.05839391052722931,
+      "eval_runtime": 15.6617,
+      "eval_samples_per_second": 197.935,
+      "eval_steps_per_second": 24.774,
       "step": 1272
     },
     {
       "epoch": 4.716981132075472,
-      "grad_norm": 0.41808268427848816,
-      "learning_rate": 9.517819706498952e-06,
-      "loss": 0.0696,
+      "grad_norm": 0.4541510045528412,
+      "learning_rate": 4.276729559748428e-06,
+      "loss": 0.0836,
       "step": 1500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 2862,
+  "max_steps": 1908,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 9,
+  "num_train_epochs": 6,
   "save_steps": 500,
-  "total_flos": 391368939443328.0,
+  "total_flos": 438748106750688.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.273786553856763,
-    "num_train_epochs": 9,
-    "temperature": 19
+    "alpha": 0.04291496094703673,
+    "num_train_epochs": 6,
+    "temperature": 7
   }
 }
run-0/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:637e9b5b80c6655b5c46a551d36eed90807960f307623b1b6dbdd399e91402a8
+oid sha256:e6d301d3792da68a32e564cb44e96306a109e871eaaeebe23fb207a1f20fef33
 size 5048
run-0/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b6ae4ac93f1ae4407f44ac51f0c3e3f713346d9faa77269f5d1ea84ee4968f81
+oid sha256:28c546392df65d1dbaa8041f493b566777db4ef4ec5235ed2f1dfa2724cc15c9
 size 268290900
run-0/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5e346175f2afbf9ea5efbeeee21f4d3212071ef576f9314214cce9be3434195f
+oid sha256:90c2485ca187e7693a3dd9cb86988e00019c2099b440b0a466a0c63d4b195d87
 size 536643898
run-0/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d593938cfde58f8053b2c2237865d46847911e8413848d69b02d7ad5d8c4729
+oid sha256:40a61c60e2a1f354f14fb973803108ea16a9c8c66946147c407c26a5211c4f3d
 size 1064
run-0/checkpoint-500/trainer_state.json CHANGED
@@ -10,32 +10,32 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.5819354838709677,
-      "eval_loss": 0.19125010073184967,
-      "eval_runtime": 14.8029,
-      "eval_samples_per_second": 209.418,
-      "eval_steps_per_second": 26.211,
+      "eval_accuracy": 0.5893548387096774,
+      "eval_loss": 0.21903084218502045,
+      "eval_runtime": 14.3987,
+      "eval_samples_per_second": 215.297,
+      "eval_steps_per_second": 26.947,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5311678647994995,
-      "learning_rate": 1.650593990216632e-05,
-      "loss": 0.3069,
+      "grad_norm": 0.5576276779174805,
+      "learning_rate": 1.4758909853249476e-05,
+      "loss": 0.3448,
       "step": 500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 2862,
+  "max_steps": 1908,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 9,
+  "num_train_epochs": 6,
   "save_steps": 500,
-  "total_flos": 130072209152340.0,
+  "total_flos": 177451376459700.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.273786553856763,
-    "num_train_epochs": 9,
-    "temperature": 19
+    "alpha": 0.04291496094703673,
+    "num_train_epochs": 6,
+    "temperature": 7
   }
 }
run-0/checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:637e9b5b80c6655b5c46a551d36eed90807960f307623b1b6dbdd399e91402a8
+oid sha256:e6d301d3792da68a32e564cb44e96306a109e871eaaeebe23fb207a1f20fef33
 size 5048
run-1/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6e90877b0095c8214551a990e216704b0b2b8e9ba7ffc3e48acfe9cb00e4055
+oid sha256:df0c1e7f0c76c2d33cfb8471753b518ac63cee9c321b2f9ff003512f3569f5dd
 size 268290900
run-1/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5bc776218a8d9b30431f9aed8b79964b4e342469a8271510efe557838d22806a
+oid sha256:29f4f538b0136a3857eeec3f711e1e7b22fb6659feb942992c6f42aae30b096a
 size 536643898
run-1/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab24af5732cdd2b225edfa7b748cb6638d90aa8efee657214a80de4c7f962c49
+oid sha256:690dc36074a76886ef61aa14bba7f4a22546d075686d785f211a5a8037fc50f8
 size 1064
run-1/checkpoint-1000/trainer_state.json CHANGED
@@ -10,57 +10,57 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.5835483870967741,
-      "eval_loss": 0.1925622820854187,
-      "eval_runtime": 14.4107,
-      "eval_samples_per_second": 215.117,
-      "eval_steps_per_second": 26.924,
+      "eval_accuracy": 0.5832258064516129,
+      "eval_loss": 0.20071792602539062,
+      "eval_runtime": 13.9879,
+      "eval_samples_per_second": 221.619,
+      "eval_steps_per_second": 27.738,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5316773056983948,
-      "learning_rate": 1.650593990216632e-05,
-      "loss": 0.3086,
+      "grad_norm": 0.5363588929176331,
+      "learning_rate": 1.550763701707098e-05,
+      "loss": 0.319,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8203225806451613,
-      "eval_loss": 0.09446250647306442,
-      "eval_runtime": 15.4923,
-      "eval_samples_per_second": 200.1,
-      "eval_steps_per_second": 25.045,
+      "eval_accuracy": 0.817741935483871,
+      "eval_loss": 0.0997093915939331,
+      "eval_runtime": 14.8543,
+      "eval_samples_per_second": 208.694,
+      "eval_steps_per_second": 26.12,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8858064516129033,
-      "eval_loss": 0.06262949109077454,
-      "eval_runtime": 14.8271,
-      "eval_samples_per_second": 209.076,
-      "eval_steps_per_second": 26.168,
+      "eval_accuracy": 0.8796774193548387,
+      "eval_loss": 0.06741480529308319,
+      "eval_runtime": 14.8004,
+      "eval_samples_per_second": 209.454,
+      "eval_steps_per_second": 26.216,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.4893101155757904,
-      "learning_rate": 1.3011879804332637e-05,
-      "loss": 0.1087,
+      "grad_norm": 0.5110210180282593,
+      "learning_rate": 1.101527403414196e-05,
+      "loss": 0.1149,
       "step": 1000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 2862,
+  "max_steps": 2226,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 9,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "total_flos": 260941334653608.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.8057539670749069,
-    "num_train_epochs": 9,
-    "temperature": 17
+    "alpha": 0.45847029900054825,
+    "num_train_epochs": 7,
+    "temperature": 12
   }
 }
run-1/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:82393f81e5d40ace9ca220d5be9fbbb0d21ce457cb7dbdf5eae8ad20d9160b86
+oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
 size 5048
run-1/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b4e229c1007b6ad2ce215aa4dca164f70eda7ff1625708c647aab32908ba3931
+oid sha256:5fc350ced5969ce561b3410d20de09cf7562da401b84b0ed0134b1437acbfb7d
 size 268290900
run-1/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:09c5b3fe155acf25f9d2cd3c58827be9177df6d5c7754c2d79f5f75e48e8897e
+oid sha256:324c9fb6b2e3ee46b5b1a32e37040174373891985432d194f761d51fc5fefa27
 size 536643898
run-1/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d64c2fcc199bcaa9f6cc6a50071234f29de10ac43b1d2efd9b3a1263b2bd2351
+oid sha256:10247f8a1efaf9e3e35daf770b64f17a24e4101d4e010d50d94597b8e48a5f16
 size 1064
run-1/checkpoint-1500/trainer_state.json CHANGED
@@ -10,73 +10,73 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.5835483870967741,
-      "eval_loss": 0.1925622820854187,
-      "eval_runtime": 14.4107,
-      "eval_samples_per_second": 215.117,
-      "eval_steps_per_second": 26.924,
+      "eval_accuracy": 0.5832258064516129,
+      "eval_loss": 0.20071792602539062,
+      "eval_runtime": 13.9879,
+      "eval_samples_per_second": 221.619,
+      "eval_steps_per_second": 27.738,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5316773056983948,
-      "learning_rate": 1.650593990216632e-05,
-      "loss": 0.3086,
+      "grad_norm": 0.5363588929176331,
+      "learning_rate": 1.550763701707098e-05,
+      "loss": 0.319,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8203225806451613,
-      "eval_loss": 0.09446250647306442,
-      "eval_runtime": 15.4923,
-      "eval_samples_per_second": 200.1,
-      "eval_steps_per_second": 25.045,
+      "eval_accuracy": 0.817741935483871,
+      "eval_loss": 0.0997093915939331,
+      "eval_runtime": 14.8543,
+      "eval_samples_per_second": 208.694,
+      "eval_steps_per_second": 26.12,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8858064516129033,
-      "eval_loss": 0.06262949109077454,
-      "eval_runtime": 14.8271,
-      "eval_samples_per_second": 209.076,
-      "eval_steps_per_second": 26.168,
+      "eval_accuracy": 0.8796774193548387,
+      "eval_loss": 0.06741480529308319,
+      "eval_runtime": 14.8004,
+      "eval_samples_per_second": 209.454,
+      "eval_steps_per_second": 26.216,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.4893101155757904,
-      "learning_rate": 1.3011879804332637e-05,
-      "loss": 0.1087,
+      "grad_norm": 0.5110210180282593,
+      "learning_rate": 1.101527403414196e-05,
+      "loss": 0.1149,
       "step": 1000
     },
     {
       "epoch": 4.0,
-      "eval_accuracy": 0.8993548387096775,
-      "eval_loss": 0.04846852272748947,
-      "eval_runtime": 15.5296,
-      "eval_samples_per_second": 199.618,
-      "eval_steps_per_second": 24.984,
+      "eval_accuracy": 0.8958064516129032,
+      "eval_loss": 0.05332249775528908,
+      "eval_runtime": 15.4008,
+      "eval_samples_per_second": 201.288,
+      "eval_steps_per_second": 25.193,
       "step": 1272
     },
     {
       "epoch": 4.716981132075472,
-      "grad_norm": 0.42099490761756897,
-      "learning_rate": 9.517819706498952e-06,
-      "loss": 0.0698,
+      "grad_norm": 0.4394994378089905,
+      "learning_rate": 6.522911051212939e-06,
+      "loss": 0.076,
       "step": 1500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 2862,
+  "max_steps": 2226,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 9,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "total_flos": 391368939443328.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.8057539670749069,
-    "num_train_epochs": 9,
-    "temperature": 17
+    "alpha": 0.45847029900054825,
+    "num_train_epochs": 7,
+    "temperature": 12
  }
 }
run-1/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:82393f81e5d40ace9ca220d5be9fbbb0d21ce457cb7dbdf5eae8ad20d9160b86
+oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
 size 5048
run-1/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e5e4f84c73c5834a60fafaf58b21c305ccf17cba848a59f1b2b3c7a562c1f39
+oid sha256:8f31f365c8cfca47c670086bb1001bed52e8c1d972d75627a773daece075694b
 size 268290900
run-1/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0abfae47ccc3b2ac5dce0b2e8c59eb14fd4af416b964760efcf19c8c11b65cfb
+oid sha256:ecb1cbaa7d177727fa64b2f9a62279c842a162c46259dad879df619d19139e69
 size 536643898
run-1/checkpoint-2000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae0e33fc92d01a91fed24b0b528de3cb9aef9121ff7d11bd1bac4f5a4898b248
+oid sha256:a711dbdbcc3fb81414c43be534b20600369a5338cb39c6b70af432fa041190bb
 size 1064
run-1/checkpoint-2000/trainer_state.json CHANGED
@@ -10,98 +10,98 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.5835483870967741,
-      "eval_loss": 0.1925622820854187,
-      "eval_runtime": 14.4107,
-      "eval_samples_per_second": 215.117,
-      "eval_steps_per_second": 26.924,
+      "eval_accuracy": 0.5832258064516129,
+      "eval_loss": 0.20071792602539062,
+      "eval_runtime": 13.9879,
+      "eval_samples_per_second": 221.619,
+      "eval_steps_per_second": 27.738,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5316773056983948,
-      "learning_rate": 1.650593990216632e-05,
-      "loss": 0.3086,
+      "grad_norm": 0.5363588929176331,
+      "learning_rate": 1.550763701707098e-05,
+      "loss": 0.319,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8203225806451613,
-      "eval_loss": 0.09446250647306442,
-      "eval_runtime": 15.4923,
-      "eval_samples_per_second": 200.1,
-      "eval_steps_per_second": 25.045,
+      "eval_accuracy": 0.817741935483871,
+      "eval_loss": 0.0997093915939331,
+      "eval_runtime": 14.8543,
+      "eval_samples_per_second": 208.694,
+      "eval_steps_per_second": 26.12,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8858064516129033,
-      "eval_loss": 0.06262949109077454,
-      "eval_runtime": 14.8271,
-      "eval_samples_per_second": 209.076,
-      "eval_steps_per_second": 26.168,
+      "eval_accuracy": 0.8796774193548387,
+      "eval_loss": 0.06741480529308319,
+      "eval_runtime": 14.8004,
+      "eval_samples_per_second": 209.454,
+      "eval_steps_per_second": 26.216,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.4893101155757904,
-      "learning_rate": 1.3011879804332637e-05,
-      "loss": 0.1087,
+      "grad_norm": 0.5110210180282593,
+      "learning_rate": 1.101527403414196e-05,
+      "loss": 0.1149,
       "step": 1000
     },
     {
       "epoch": 4.0,
-      "eval_accuracy": 0.8993548387096775,
-      "eval_loss": 0.04846852272748947,
-      "eval_runtime": 15.5296,
-      "eval_samples_per_second": 199.618,
-      "eval_steps_per_second": 24.984,
+      "eval_accuracy": 0.8958064516129032,
+      "eval_loss": 0.05332249775528908,
+      "eval_runtime": 15.4008,
+      "eval_samples_per_second": 201.288,
+      "eval_steps_per_second": 25.193,
       "step": 1272
     },
     {
       "epoch": 4.716981132075472,
-      "grad_norm": 0.42099490761756897,
-      "learning_rate": 9.517819706498952e-06,
-      "loss": 0.0698,
+      "grad_norm": 0.4394994378089905,
+      "learning_rate": 6.522911051212939e-06,
+      "loss": 0.076,
       "step": 1500
     },
     {
       "epoch": 5.0,
-      "eval_accuracy": 0.9080645161290323,
-      "eval_loss": 0.04048996791243553,
-      "eval_runtime": 14.4172,
-      "eval_samples_per_second": 215.02,
-      "eval_steps_per_second": 26.912,
+      "eval_accuracy": 0.9029032258064517,
+      "eval_loss": 0.045641668140888214,
+      "eval_runtime": 15.8694,
+      "eval_samples_per_second": 195.344,
+      "eval_steps_per_second": 24.45,
       "step": 1590
     },
     {
       "epoch": 6.0,
-      "eval_accuracy": 0.912258064516129,
-      "eval_loss": 0.03561786934733391,
-      "eval_runtime": 14.8396,
-      "eval_samples_per_second": 208.901,
-      "eval_steps_per_second": 26.146,
+      "eval_accuracy": 0.9064516129032258,
+      "eval_loss": 0.04183841869235039,
+      "eval_runtime": 14.4904,
+      "eval_samples_per_second": 213.935,
+      "eval_steps_per_second": 26.776,
       "step": 1908
     },
     {
       "epoch": 6.289308176100629,
-      "grad_norm": 0.2000160664319992,
-      "learning_rate": 6.02375960866527e-06,
-      "loss": 0.0554,
+      "grad_norm": 0.24045950174331665,
+      "learning_rate": 2.0305480682839176e-06,
+      "loss": 0.0628,
       "step": 2000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 2862,
+  "max_steps": 2226,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 9,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "total_flos": 520991326672152.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.8057539670749069,
-    "num_train_epochs": 9,
-    "temperature": 17
+    "alpha": 0.45847029900054825,
+    "num_train_epochs": 7,
+    "temperature": 12
   }
 }
run-1/checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:82393f81e5d40ace9ca220d5be9fbbb0d21ce457cb7dbdf5eae8ad20d9160b86
+oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
 size 5048
run-1/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1042d386874e430096f4d5799166c2e2b83c2920239d2fdfe24c7dd2cda8952a
+oid sha256:c9ff7ee88c51305753f34881d5d807803d2ef768fdf00a44f2240dc31c12d143
 size 268290900
run-1/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9e11b130ad4f362f5c8a117ca7f59dd31b8fe053e335995042e3bee8205ebfb7
+oid sha256:ed5aa0f69b2d0528b6cde1f2e0cb8bca5fa3309cd55e96158515b886914782b4
 size 536643898
run-1/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d593938cfde58f8053b2c2237865d46847911e8413848d69b02d7ad5d8c4729
+oid sha256:7c898ef245a654a7c97315fc9d4b5879d3c8228b82434bdf35669bf866597c60
 size 1064
run-1/checkpoint-500/trainer_state.json CHANGED
@@ -10,32 +10,32 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.5835483870967741,
-      "eval_loss": 0.1925622820854187,
-      "eval_runtime": 14.4107,
-      "eval_samples_per_second": 215.117,
-      "eval_steps_per_second": 26.924,
+      "eval_accuracy": 0.5832258064516129,
+      "eval_loss": 0.20071792602539062,
+      "eval_runtime": 13.9879,
+      "eval_samples_per_second": 221.619,
+      "eval_steps_per_second": 27.738,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5316773056983948,
-      "learning_rate": 1.650593990216632e-05,
-      "loss": 0.3086,
+      "grad_norm": 0.5363588929176331,
+      "learning_rate": 1.550763701707098e-05,
+      "loss": 0.319,
       "step": 500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 2862,
+  "max_steps": 2226,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 9,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "total_flos": 130072209152340.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.8057539670749069,
-    "num_train_epochs": 9,
-    "temperature": 17
+    "alpha": 0.45847029900054825,
+    "num_train_epochs": 7,
+    "temperature": 12
   }
 }
run-1/checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:82393f81e5d40ace9ca220d5be9fbbb0d21ce457cb7dbdf5eae8ad20d9160b86
+oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
 size 5048
run-2/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f388dbab099e3f7f8db7ccd3f5ca759fd1c7096bc7d954af0578e35492a6738
+oid sha256:0fe188f101f50feb9198e95a3dc4c7e41b408f1d4566257a86bbf8d149867e46
 size 268290900
run-2/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bd6a6e541567cfa4dfeff9fe69a4fdba2300924a22471ccc220f6e692fd8ae42
+oid sha256:be1f42c06e9cd80680a9cd15ef8cf85ddfb7e00ada7b80ee55d20c1adabf6d04
 size 536643898
run-2/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9bc5852f25aced6ff34fc8fb4a6bf0a05a098a614f227bfe9cd5c74e86862eb
+oid sha256:690dc36074a76886ef61aa14bba7f4a22546d075686d785f211a5a8037fc50f8
 size 1064
run-2/checkpoint-1000/trainer_state.json CHANGED
@@ -10,57 +10,57 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.6016129032258064,
-      "eval_loss": 0.2116461843252182,
-      "eval_runtime": 16.545,
-      "eval_samples_per_second": 187.367,
-      "eval_steps_per_second": 23.451,
+      "eval_accuracy": 0.5706451612903226,
+      "eval_loss": 0.193728968501091,
+      "eval_runtime": 13.9503,
+      "eval_samples_per_second": 222.218,
+      "eval_steps_per_second": 27.813,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5623254179954529,
-      "learning_rate": 1.685534591194969e-05,
-      "loss": 0.3397,
+      "grad_norm": 0.5241254568099976,
+      "learning_rate": 1.550763701707098e-05,
+      "loss": 0.3078,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8306451612903226,
-      "eval_loss": 0.0993104875087738,
-      "eval_runtime": 15.946,
-      "eval_samples_per_second": 194.406,
-      "eval_steps_per_second": 24.332,
+      "eval_accuracy": 0.8125806451612904,
+      "eval_loss": 0.09734208881855011,
+      "eval_runtime": 15.441,
+      "eval_samples_per_second": 200.764,
+      "eval_steps_per_second": 25.128,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8909677419354839,
-      "eval_loss": 0.06301400810480118,
-      "eval_runtime": 20.5764,
-      "eval_samples_per_second": 150.658,
-      "eval_steps_per_second": 18.857,
+      "eval_accuracy": 0.8764516129032258,
+      "eval_loss": 0.06649673730134964,
+      "eval_runtime": 15.2951,
+      "eval_samples_per_second": 202.68,
+      "eval_steps_per_second": 25.368,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.5391496419906616,
-      "learning_rate": 1.371069182389937e-05,
-      "loss": 0.1148,
+      "grad_norm": 0.48943573236465454,
+      "learning_rate": 1.101527403414196e-05,
+      "loss": 0.1119,
       "step": 1000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3180,
+  "max_steps": 2226,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 10,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "total_flos": 260941334653608.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.7461709946754571,
-    "num_train_epochs": 10,
-    "temperature": 7
+    "alpha": 0.6214130149862211,
+    "num_train_epochs": 7,
+    "temperature": 20
   }
 }
run-2/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9a4d37eb8abbebe4bdd3ca2a0149ca932618c2fed5240fef5e6644698f826fc
+oid sha256:e4b95c2d951c8ddeee32e64101bcd9cc3cdfbbcc360a14eb2bcff65f122add0c
 size 5048
run-2/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:92aa69c904731bafd74b18cd4ccd0d8070c2d3c344ebb0de74fdff476ad8dc1c
+oid sha256:088b5f3d69528548b35406acaa3db91dee8992d804676296feaad4872201b67b
 size 268290900
run-2/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33b2f7bf7935db50db0eab455c5e7e9a035b72b72c346ff249928d978975639d
+oid sha256:c40ca0e30e7620794125bfec7c53291f960cf7acc42d62bb271882880164a069
 size 536643898
run-2/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:75147a7d799338816ad08144955be6ffec8d6a2b0e3eb02f6e2e0f9c64d66e19
+oid sha256:10247f8a1efaf9e3e35daf770b64f17a24e4101d4e010d50d94597b8e48a5f16
 size 1064
run-2/checkpoint-1500/trainer_state.json CHANGED
@@ -10,73 +10,73 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.6016129032258064,
-      "eval_loss": 0.2116461843252182,
-      "eval_runtime": 16.545,
-      "eval_samples_per_second": 187.367,
-      "eval_steps_per_second": 23.451,
+      "eval_accuracy": 0.5706451612903226,
+      "eval_loss": 0.193728968501091,
+      "eval_runtime": 13.9503,
+      "eval_samples_per_second": 222.218,
+      "eval_steps_per_second": 27.813,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5623254179954529,
-      "learning_rate": 1.685534591194969e-05,
-      "loss": 0.3397,
+      "grad_norm": 0.5241254568099976,
+      "learning_rate": 1.550763701707098e-05,
+      "loss": 0.3078,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8306451612903226,
-      "eval_loss": 0.0993104875087738,
-      "eval_runtime": 15.946,
-      "eval_samples_per_second": 194.406,
-      "eval_steps_per_second": 24.332,
+      "eval_accuracy": 0.8125806451612904,
+      "eval_loss": 0.09734208881855011,
+      "eval_runtime": 15.441,
+      "eval_samples_per_second": 200.764,
+      "eval_steps_per_second": 25.128,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8909677419354839,
-      "eval_loss": 0.06301400810480118,
-      "eval_runtime": 20.5764,
-      "eval_samples_per_second": 150.658,
-      "eval_steps_per_second": 18.857,
+      "eval_accuracy": 0.8764516129032258,
+      "eval_loss": 0.06649673730134964,
+      "eval_runtime": 15.2951,
+      "eval_samples_per_second": 202.68,
+      "eval_steps_per_second": 25.368,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.5391496419906616,
-      "learning_rate": 1.371069182389937e-05,
-      "loss": 0.1148,
+      "grad_norm": 0.48943573236465454,
+      "learning_rate": 1.101527403414196e-05,
+      "loss": 0.1119,
       "step": 1000
     },
     {
       "epoch": 4.0,
-      "eval_accuracy": 0.9045161290322581,
-      "eval_loss": 0.04739077761769295,
-      "eval_runtime": 15.9991,
-      "eval_samples_per_second": 193.761,
-      "eval_steps_per_second": 24.251,
+      "eval_accuracy": 0.8961290322580645,
+      "eval_loss": 0.052904579788446426,
+      "eval_runtime": 15.6918,
+      "eval_samples_per_second": 197.556,
+      "eval_steps_per_second": 24.726,
       "step": 1272
     },
     {
       "epoch": 4.716981132075472,
-      "grad_norm": 0.44755664467811584,
-      "learning_rate": 1.0566037735849058e-05,
-      "loss": 0.0708,
+      "grad_norm": 0.4308073818683624,
+      "learning_rate": 6.522911051212939e-06,
+      "loss": 0.0747,
       "step": 1500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3180,
+  "max_steps": 2226,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 10,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "total_flos": 391368939443328.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.7461709946754571,
-    "num_train_epochs": 10,
-    "temperature": 7
+    "alpha": 0.6214130149862211,
+    "num_train_epochs": 7,
+    "temperature": 20
  }
 }
run-2/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9a4d37eb8abbebe4bdd3ca2a0149ca932618c2fed5240fef5e6644698f826fc
+oid sha256:e4b95c2d951c8ddeee32e64101bcd9cc3cdfbbcc360a14eb2bcff65f122add0c
 size 5048
run-2/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:222f897576baee6ba12810eb33e1f8a1797da58b51522cb8ee06e398346c7a8f
+oid sha256:3aaecbafc3ed40a9084ebb560e44458e3b65f97f097db4423e14c2a5e9e6d9ad
 size 268290900
run-2/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b154d0f7f4a2a66ed74c1a8bb5329af4696b03ab7ea291a036aa85f3b60fd94b
+oid sha256:d3fabec28558416d5dbb27164a4cea59f5112a144c5f425ef7473f151e5e7225
 size 536643898
run-2/checkpoint-2000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c91f3bc1f2f9939a530d06aa58b755ac1447797d2f8372c11bed9a0b328f055d
+oid sha256:a711dbdbcc3fb81414c43be534b20600369a5338cb39c6b70af432fa041190bb
 size 1064
run-2/checkpoint-2000/trainer_state.json CHANGED
@@ -10,98 +10,98 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.6016129032258064,
-      "eval_loss": 0.2116461843252182,
-      "eval_runtime": 16.545,
-      "eval_samples_per_second": 187.367,
-      "eval_steps_per_second": 23.451,
+      "eval_accuracy": 0.5706451612903226,
+      "eval_loss": 0.193728968501091,
+      "eval_runtime": 13.9503,
+      "eval_samples_per_second": 222.218,
+      "eval_steps_per_second": 27.813,
       "step": 318
     },
     {
       "epoch": 1.5723270440251573,
-      "grad_norm": 0.5623254179954529,
-      "learning_rate": 1.685534591194969e-05,
-      "loss": 0.3397,
+      "grad_norm": 0.5241254568099976,
+      "learning_rate": 1.550763701707098e-05,
+      "loss": 0.3078,
       "step": 500
     },
     {
       "epoch": 2.0,
-      "eval_accuracy": 0.8306451612903226,
-      "eval_loss": 0.0993104875087738,
-      "eval_runtime": 15.946,
-      "eval_samples_per_second": 194.406,
-      "eval_steps_per_second": 24.332,
+      "eval_accuracy": 0.8125806451612904,
+      "eval_loss": 0.09734208881855011,
+      "eval_runtime": 15.441,
+      "eval_samples_per_second": 200.764,
+      "eval_steps_per_second": 25.128,
       "step": 636
     },
     {
       "epoch": 3.0,
-      "eval_accuracy": 0.8909677419354839,
-      "eval_loss": 0.06301400810480118,
-      "eval_runtime": 20.5764,
-      "eval_samples_per_second": 150.658,
-      "eval_steps_per_second": 18.857,
+      "eval_accuracy": 0.8764516129032258,
+      "eval_loss": 0.06649673730134964,
+      "eval_runtime": 15.2951,
+      "eval_samples_per_second": 202.68,
+      "eval_steps_per_second": 25.368,
       "step": 954
     },
     {
       "epoch": 3.1446540880503147,
-      "grad_norm": 0.5391496419906616,
-      "learning_rate": 1.371069182389937e-05,
-      "loss": 0.1148,
+      "grad_norm": 0.48943573236465454,
+      "learning_rate": 1.101527403414196e-05,
+      "loss": 0.1119,
       "step": 1000
     },
     {
       "epoch": 4.0,
-      "eval_accuracy": 0.9045161290322581,
-      "eval_loss": 0.04739077761769295,
-      "eval_runtime": 15.9991,
-      "eval_samples_per_second": 193.761,
-      "eval_steps_per_second": 24.251,
+      "eval_accuracy": 0.8961290322580645,
+      "eval_loss": 0.052904579788446426,
+      "eval_runtime": 15.6918,
+      "eval_samples_per_second": 197.556,
+      "eval_steps_per_second": 24.726,
       "step": 1272
     },
     {
       "epoch": 4.716981132075472,
-      "grad_norm": 0.44755664467811584,
-      "learning_rate": 1.0566037735849058e-05,
-      "loss": 0.0708,
+      "grad_norm": 0.4308073818683624,
+      "learning_rate": 6.522911051212939e-06,
+      "loss": 0.0747,
       "step": 1500
     },
     {
       "epoch": 5.0,
-      "eval_accuracy": 0.9129032258064517,
-      "eval_loss": 0.038904447108507156,
-      "eval_runtime": 17.1562,
-      "eval_samples_per_second": 180.692,
-      "eval_steps_per_second": 22.616,
+      "eval_accuracy": 0.9025806451612903,
+      "eval_loss": 0.04545857757329941,
+      "eval_runtime": 16.5544,
+      "eval_samples_per_second": 187.262,
+      "eval_steps_per_second": 23.438,
       "step": 1590
     },
     {
       "epoch": 6.0,
-      "eval_accuracy": 0.9219354838709677,
-      "eval_loss": 0.03372984379529953,
-      "eval_runtime": 18.1417,
-      "eval_samples_per_second": 170.877,
-      "eval_steps_per_second": 21.387,
+      "eval_accuracy": 0.9048387096774193,
+      "eval_loss": 0.04176154360175133,
+      "eval_runtime": 15.4351,
+      "eval_samples_per_second": 200.841,
+      "eval_steps_per_second": 25.138,
      "step": 1908
     },
     {
       "epoch": 6.289308176100629,
-      "grad_norm": 0.19874028861522675,
-      "learning_rate": 7.421383647798742e-06,
-      "loss": 0.0549,
+      "grad_norm": 0.23608292639255524,
+      "learning_rate": 2.0305480682839176e-06,
+      "loss": 0.0619,
       "step": 2000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3180,
+  "max_steps": 2226,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 10,
+  "num_train_epochs": 7,
   "save_steps": 500,
   "total_flos": 520991326672152.0,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": {
-    "alpha": 0.7461709946754571,
-    "num_train_epochs": 10,
-    "temperature": 7
+    "alpha": 0.6214130149862211,
+    "num_train_epochs": 7,
+    "temperature": 20
   }
 }
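
The trainer_state.json changes above swap the logged history and trial_params of one hyperparameter trial for another (different alpha, temperature, and num_train_epochs in each run-* directory); the run-0/run-1/run-2 layout matches checkpoints written by the Hugging Face Trainer during a hyperparameter search, though the exact search setup is not part of this diff. As a small illustrative sketch (not code from this repository), the evaluation history recorded in any of these files can be summarized like this:

# Sketch: report the best eval_accuracy logged in a checkpoint's trainer_state.json.
import json

def best_eval_accuracy(path: str) -> tuple[float, float]:
    with open(path) as f:
        state = json.load(f)
    evals = [e for e in state["log_history"] if "eval_accuracy" in e]
    best = max(evals, key=lambda e: e["eval_accuracy"])
    return best["epoch"], best["eval_accuracy"]

epoch, acc = best_eval_accuracy("run-2/checkpoint-2000/trainer_state.json")
print(f"best eval_accuracy {acc:.4f} at epoch {epoch:g}")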