inflaton commited on
Commit
c6eea0c
1 Parent(s): 472b12c

Training in progress, step 6000

Browse files
Files changed (39) hide show
  1. model.safetensors +1 -1
  2. run-3/checkpoint-4000/model.safetensors +1 -1
  3. run-3/checkpoint-4000/optimizer.pt +1 -1
  4. run-3/checkpoint-4000/rng_state.pth +1 -1
  5. run-3/checkpoint-4000/scheduler.pt +1 -1
  6. run-3/checkpoint-4000/training_args.bin +1 -1
  7. run-3/checkpoint-4500/model.safetensors +1 -1
  8. run-3/checkpoint-4500/optimizer.pt +1 -1
  9. run-3/checkpoint-4500/rng_state.pth +1 -1
  10. run-3/checkpoint-4500/scheduler.pt +1 -1
  11. run-3/checkpoint-4500/training_args.bin +1 -1
  12. run-3/checkpoint-5000/model.safetensors +1 -1
  13. run-3/checkpoint-5000/optimizer.pt +1 -1
  14. run-3/checkpoint-5000/rng_state.pth +1 -1
  15. run-3/checkpoint-5000/scheduler.pt +1 -1
  16. run-3/checkpoint-5000/training_args.bin +1 -1
  17. run-3/checkpoint-5500/model.safetensors +1 -1
  18. run-3/checkpoint-5500/optimizer.pt +1 -1
  19. run-3/checkpoint-5500/rng_state.pth +1 -1
  20. run-3/checkpoint-5500/scheduler.pt +1 -1
  21. run-3/checkpoint-5500/training_args.bin +1 -1
  22. run-3/checkpoint-6000/model.safetensors +1 -1
  23. run-3/checkpoint-6000/optimizer.pt +1 -1
  24. run-3/checkpoint-6000/rng_state.pth +1 -1
  25. run-3/checkpoint-6000/scheduler.pt +1 -1
  26. run-3/checkpoint-6000/trainer_state.json +78 -78
  27. run-3/checkpoint-6000/training_args.bin +1 -1
  28. run-3/checkpoint-6500/model.safetensors +1 -1
  29. run-3/checkpoint-6500/optimizer.pt +1 -1
  30. run-3/checkpoint-6500/rng_state.pth +1 -1
  31. run-3/checkpoint-6500/scheduler.pt +1 -1
  32. run-3/checkpoint-6500/trainer_state.json +81 -81
  33. run-3/checkpoint-6500/training_args.bin +1 -1
  34. run-3/checkpoint-7000/model.safetensors +1 -1
  35. run-3/checkpoint-7000/optimizer.pt +1 -1
  36. run-3/checkpoint-7000/rng_state.pth +1 -1
  37. run-3/checkpoint-7000/scheduler.pt +1 -1
  38. run-3/checkpoint-7000/trainer_state.json +89 -89
  39. run-3/checkpoint-7000/training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db0c754a09ef5d8f060aae9b2d0f9bdc0b4a43e371f56b329e9aad51b41edeb0
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:357873f897ac28dcafef3fa3fa53be07c2ae1ce0ee7c067b30c199d7205c8456
3
  size 1340618660
run-3/checkpoint-4000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cb85d6c5dfd037550de5a993624ddda492b4b3ed10e4e6eff7bdc98dd3ae2ec
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf482efd461a8b92fa2ced7f16699a274ac312a962d4bb81259d9b23caa7fbe1
3
  size 1340618660
run-3/checkpoint-4000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c2bcb3a195fffa158e8cf1058e5ef39756e488fc9278c1a900317f083c30276
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2a7bf7ff3c0c0791b258d882964741d4dddac21bd018d264c509ef22e57d0aa
3
  size 2681472237
run-3/checkpoint-4000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e07840293faa26982fb60ecf67927902569af932812ea428cf0afa3a38536eb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d65eeaf2dd6e25acaa89c7669e032118233a7db49d07f0c99cc6439d496417b
3
  size 14244
run-3/checkpoint-4000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:677637e2929bbbe4f92d799cf4a43bf0dab61205c194ca2d4f5d8fe36706666e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b70e2b4949e2d396aca261edad5501b5c6cf802168746c892b94d2ef7d820e0
3
  size 1064
run-3/checkpoint-4000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-4500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a354e2701ba50b5ecb3a56a0ca351f8d42eb2b55c650361302caa2c3c5b81365
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:382a8f46eadf767ed8da97df16ca3df66a409ff23b26af6a2745ff907ca25530
3
  size 1340618660
run-3/checkpoint-4500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fce0e3a6a7b2934519b9a2af8341c9fe9cc64f7e764ade4fb3be7293e87e7eb1
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b738c555c50c08e23cb5e22cf2b09d6275281380346d11079a373548a519e1
3
  size 2681472237
run-3/checkpoint-4500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37fcbed10402ef9ac2d1810bd54915d476a128b155f3eeec1b8589cff633331f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53e9fb867ae884a58402e7b3b9a9f22e8f411dc167b418dee588d4b62db82684
3
  size 14244
run-3/checkpoint-4500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d3e8b59116a58d61a6641c90b7c3897245289dacd44c20ed7921faaa48acb38
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:836e3991ee1cc34d19f43898c3076c25b8a0367ad6a1217c062230a80dc79d0d
3
  size 1064
run-3/checkpoint-4500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-5000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab21c214eb2cd60372040c71f173bf312ede2736be1e2387ff13e5dc7836b6eb
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f656c5e41f4db468459396594f52951edf346944c022c6a51fe91022d752880
3
  size 1340618660
run-3/checkpoint-5000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94ab4dc6647e9e4eff8d9f2d9c74119275c92ae0a357c2099779b0bf6c2a6049
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9960712477ff26f5f82209fef0cfa0fdc1268e06a394b9a86f091b6cc09276c5
3
  size 2681472237
run-3/checkpoint-5000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c21edb04148a5d8b9d464c6921aa11e6d9df39bf78ac062cee9e3c3142b3ceb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7464ebba6819e0c68b094da2227ebd7b7e48fa501069e61ff0c479a55d431d86
3
  size 14244
run-3/checkpoint-5000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2087477951553f5262693d17ac22e0696624ba770a2c9796b9ec7ca0335b51bf
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd000ad96d274229ffdddadc85494aa58efe28c1a588aed7e940403a55b33a50
3
  size 1064
run-3/checkpoint-5000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-5500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99c37f6b70d79d8cbf9bc2c478b922e5501693ff444b0fe4687ea02434cd0cd4
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59133785062b3f02adc682bde548abd9a14813e20e605e69c971f9e1cf743b44
3
  size 1340618660
run-3/checkpoint-5500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f42f74147407307478da180f49426713f01c781ff49b66ea33a31455343fa6a3
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:823a91a9a9f45cb34713826bb763fe820bfb642a342d0db495d61a7afa005c82
3
  size 2681472237
run-3/checkpoint-5500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:faf66ed3277d116b58c8085fc54f45583cc9a5800fea6c2965c28353e94c4527
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4c5967a58f1402443b33894cea74c9f032e86a1e8454f41569028ccf79a7622
3
  size 14244
run-3/checkpoint-5500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e1f63da3db24bdeb91b8e242b4bd5d6aa10d806849b0c8bb8a9422156f22406
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:328f66a371abac671a66cc9d36cee10be25fd4d036bad87dd50b1a40a8805410
3
  size 1064
run-3/checkpoint-5500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-6000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bdc3b431ef3ff736397876b609a72442bb798c06e5844cd1db0945018726869
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:357873f897ac28dcafef3fa3fa53be07c2ae1ce0ee7c067b30c199d7205c8456
3
  size 1340618660
run-3/checkpoint-6000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:367a2864fe7dec2c0ae5746d69fbd714600342dbbb5d8ea5ffbeed56360b9299
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ff50c71b14db9ed2dc9ab3c5630a214d2f8cd30274ef8942b8db0726ea1613c
3
  size 2681472237
run-3/checkpoint-6000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed173a391c403110e59e52f654ba7ec0f0798cdb67f2bdaa89351e70f329acec
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88211143abc4ea5f4e151fd815af9be01e8a86ec8565449bd20ccf3f1d4ddcb6
3
  size 14244
run-3/checkpoint-6000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d86f30ce0718f85afdd26864a6e946e0c132a304a354f2c44a2a7c087ffa4087
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5974e9076a3f51360e8ba5d82f806d211f25caaf2ff1e16f0a4a3a32639e126a
3
  size 1064
run-3/checkpoint-6000/trainer_state.json CHANGED
@@ -10,158 +10,158 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
- "eval_accuracy": 0.7763996124267578,
99
- "eval_loss": 1.194653034210205,
100
- "eval_runtime": 8.5079,
101
- "eval_samples_per_second": 350.617,
102
- "eval_steps_per_second": 21.98,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
- "grad_norm": 0.04226335510611534,
108
- "learning_rate": 9.29117312275507e-06,
109
- "loss": 0.025,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
- "eval_accuracy": 0.7713711261749268,
115
- "eval_loss": 1.458601713180542,
116
- "eval_runtime": 8.5492,
117
- "eval_samples_per_second": 348.92,
118
- "eval_steps_per_second": 21.873,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
- "grad_norm": 1.8958851099014282,
124
- "learning_rate": 7.94851804721243e-06,
125
- "loss": 0.0138,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
- "grad_norm": 0.009470508433878422,
131
- "learning_rate": 6.60586297166979e-06,
132
- "loss": 0.0098,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
- "eval_accuracy": 0.7660073637962341,
138
- "eval_loss": 1.4296730756759644,
139
- "eval_runtime": 8.5064,
140
- "eval_samples_per_second": 350.675,
141
- "eval_steps_per_second": 21.983,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
- "grad_norm": 0.5303798317909241,
147
- "learning_rate": 5.26320789612715e-06,
148
- "loss": 0.0083,
149
  "step": 5500
150
  },
151
  {
152
  "epoch": 8.0,
153
- "eval_accuracy": 0.7720415592193604,
154
- "eval_loss": 1.5305761098861694,
155
- "eval_runtime": 8.614,
156
- "eval_samples_per_second": 346.297,
157
- "eval_steps_per_second": 21.709,
158
  "step": 5968
159
  },
160
  {
161
  "epoch": 8.04289544235925,
162
- "grad_norm": 0.8904930949211121,
163
- "learning_rate": 3.920552820584509e-06,
164
- "loss": 0.0044,
165
  "step": 6000
166
  }
167
  ],
@@ -170,11 +170,11 @@
170
  "num_input_tokens_seen": 0,
171
  "num_train_epochs": 10,
172
  "save_steps": 500,
173
- "total_flos": 1.4223015603234576e+16,
174
  "train_batch_size": 16,
175
  "trial_name": null,
176
  "trial_params": {
177
- "learning_rate": 2.0032413727096193e-05,
178
  "per_device_train_batch_size": 16
179
  }
180
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
+ "eval_accuracy": 0.7700302004814148,
99
+ "eval_loss": 1.4166399240493774,
100
+ "eval_runtime": 8.8683,
101
+ "eval_samples_per_second": 336.367,
102
+ "eval_steps_per_second": 21.086,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
+ "grad_norm": 0.004815839231014252,
108
+ "learning_rate": 1.2636980831890788e-05,
109
+ "loss": 0.034,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
+ "eval_accuracy": 0.7720415592193604,
115
+ "eval_loss": 1.576446533203125,
116
+ "eval_runtime": 8.9004,
117
+ "eval_samples_per_second": 335.152,
118
+ "eval_steps_per_second": 21.01,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
+ "grad_norm": 0.29464954137802124,
124
+ "learning_rate": 1.0810827532484605e-05,
125
+ "loss": 0.0175,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
+ "grad_norm": 0.010658634826540947,
131
+ "learning_rate": 8.984674233078421e-06,
132
+ "loss": 0.0101,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
+ "eval_accuracy": 0.7753939032554626,
138
+ "eval_loss": 1.5760776996612549,
139
+ "eval_runtime": 8.8735,
140
+ "eval_samples_per_second": 336.169,
141
+ "eval_steps_per_second": 21.074,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
+ "grad_norm": 32.647804260253906,
147
+ "learning_rate": 7.158520933672239e-06,
148
+ "loss": 0.0101,
149
  "step": 5500
150
  },
151
  {
152
  "epoch": 8.0,
153
+ "eval_accuracy": 0.7733824849128723,
154
+ "eval_loss": 1.5171312093734741,
155
+ "eval_runtime": 8.9008,
156
+ "eval_samples_per_second": 335.137,
157
+ "eval_steps_per_second": 21.009,
158
  "step": 5968
159
  },
160
  {
161
  "epoch": 8.04289544235925,
162
+ "grad_norm": 0.006861701142042875,
163
+ "learning_rate": 5.332367634266056e-06,
164
+ "loss": 0.0064,
165
  "step": 6000
166
  }
167
  ],
 
170
  "num_input_tokens_seen": 0,
171
  "num_train_epochs": 10,
172
  "save_steps": 500,
173
+ "total_flos": 1.425589883589798e+16,
174
  "train_batch_size": 16,
175
  "trial_name": null,
176
  "trial_params": {
177
+ "learning_rate": 2.7246207227140256e-05,
178
  "per_device_train_batch_size": 16
179
  }
180
  }
run-3/checkpoint-6000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-6500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69c288cc5cc6770f23016de36e1ae0c3bc3769e0d0c7be6c0b8ebaf8de955fba
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31ec2188671ef60fb5d11cf43ed926a0f8ad799f26919a5a8a6693b9245fc4e5
3
  size 1340618660
run-3/checkpoint-6500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86a0e1b745d47c260fc50d564c4b9c1eef56f536944fcc44e5a2a47551b6684b
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f6b8b3b7ccb46348a8b4d934977929d9fd3ca1af1cd003a247aa822c1bfb929
3
  size 2681472237
run-3/checkpoint-6500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a549425085619aa35f5dbdda83584f7d7cf88c514830ff63d3b892a419b0845
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b81d476f21a8f359e4d3f42b921b67967019e455d484e4de9d785117a493ca
3
  size 14244
run-3/checkpoint-6500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c68b2737fdedfb089499582882cc05d926adc001a7f35e0cd75dc0c145373fb9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773f880e1ac6dc3230d622b85e4abbcadb78c0f0a15af651b1285dedb6e9a315
3
  size 1064
run-3/checkpoint-6500/trainer_state.json CHANGED
@@ -10,165 +10,165 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
- "eval_accuracy": 0.7763996124267578,
99
- "eval_loss": 1.194653034210205,
100
- "eval_runtime": 8.5079,
101
- "eval_samples_per_second": 350.617,
102
- "eval_steps_per_second": 21.98,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
- "grad_norm": 0.04226335510611534,
108
- "learning_rate": 9.29117312275507e-06,
109
- "loss": 0.025,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
- "eval_accuracy": 0.7713711261749268,
115
- "eval_loss": 1.458601713180542,
116
- "eval_runtime": 8.5492,
117
- "eval_samples_per_second": 348.92,
118
- "eval_steps_per_second": 21.873,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
- "grad_norm": 1.8958851099014282,
124
- "learning_rate": 7.94851804721243e-06,
125
- "loss": 0.0138,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
- "grad_norm": 0.009470508433878422,
131
- "learning_rate": 6.60586297166979e-06,
132
- "loss": 0.0098,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
- "eval_accuracy": 0.7660073637962341,
138
- "eval_loss": 1.4296730756759644,
139
- "eval_runtime": 8.5064,
140
- "eval_samples_per_second": 350.675,
141
- "eval_steps_per_second": 21.983,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
- "grad_norm": 0.5303798317909241,
147
- "learning_rate": 5.26320789612715e-06,
148
- "loss": 0.0083,
149
  "step": 5500
150
  },
151
  {
152
  "epoch": 8.0,
153
- "eval_accuracy": 0.7720415592193604,
154
- "eval_loss": 1.5305761098861694,
155
- "eval_runtime": 8.614,
156
- "eval_samples_per_second": 346.297,
157
- "eval_steps_per_second": 21.709,
158
  "step": 5968
159
  },
160
  {
161
  "epoch": 8.04289544235925,
162
- "grad_norm": 0.8904930949211121,
163
- "learning_rate": 3.920552820584509e-06,
164
- "loss": 0.0044,
165
  "step": 6000
166
  },
167
  {
168
  "epoch": 8.71313672922252,
169
- "grad_norm": 15.417210578918457,
170
- "learning_rate": 2.5778977450418694e-06,
171
- "loss": 0.0032,
172
  "step": 6500
173
  }
174
  ],
@@ -177,11 +177,11 @@
177
  "num_input_tokens_seen": 0,
178
  "num_train_epochs": 10,
179
  "save_steps": 500,
180
- "total_flos": 1.540790408249352e+16,
181
  "train_batch_size": 16,
182
  "trial_name": null,
183
  "trial_params": {
184
- "learning_rate": 2.0032413727096193e-05,
185
  "per_device_train_batch_size": 16
186
  }
187
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
+ "eval_accuracy": 0.7700302004814148,
99
+ "eval_loss": 1.4166399240493774,
100
+ "eval_runtime": 8.8683,
101
+ "eval_samples_per_second": 336.367,
102
+ "eval_steps_per_second": 21.086,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
+ "grad_norm": 0.004815839231014252,
108
+ "learning_rate": 1.2636980831890788e-05,
109
+ "loss": 0.034,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
+ "eval_accuracy": 0.7720415592193604,
115
+ "eval_loss": 1.576446533203125,
116
+ "eval_runtime": 8.9004,
117
+ "eval_samples_per_second": 335.152,
118
+ "eval_steps_per_second": 21.01,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
+ "grad_norm": 0.29464954137802124,
124
+ "learning_rate": 1.0810827532484605e-05,
125
+ "loss": 0.0175,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
+ "grad_norm": 0.010658634826540947,
131
+ "learning_rate": 8.984674233078421e-06,
132
+ "loss": 0.0101,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
+ "eval_accuracy": 0.7753939032554626,
138
+ "eval_loss": 1.5760776996612549,
139
+ "eval_runtime": 8.8735,
140
+ "eval_samples_per_second": 336.169,
141
+ "eval_steps_per_second": 21.074,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
+ "grad_norm": 32.647804260253906,
147
+ "learning_rate": 7.158520933672239e-06,
148
+ "loss": 0.0101,
149
  "step": 5500
150
  },
151
  {
152
  "epoch": 8.0,
153
+ "eval_accuracy": 0.7733824849128723,
154
+ "eval_loss": 1.5171312093734741,
155
+ "eval_runtime": 8.9008,
156
+ "eval_samples_per_second": 335.137,
157
+ "eval_steps_per_second": 21.009,
158
  "step": 5968
159
  },
160
  {
161
  "epoch": 8.04289544235925,
162
+ "grad_norm": 0.006861701142042875,
163
+ "learning_rate": 5.332367634266056e-06,
164
+ "loss": 0.0064,
165
  "step": 6000
166
  },
167
  {
168
  "epoch": 8.71313672922252,
169
+ "grad_norm": 1.2482685633585788e-05,
170
+ "learning_rate": 3.5062143348598724e-06,
171
+ "loss": 0.0028,
172
  "step": 6500
173
  }
174
  ],
 
177
  "num_input_tokens_seen": 0,
178
  "num_train_epochs": 10,
179
  "save_steps": 500,
180
+ "total_flos": 1.5455057465941884e+16,
181
  "train_batch_size": 16,
182
  "trial_name": null,
183
  "trial_params": {
184
+ "learning_rate": 2.7246207227140256e-05,
185
  "per_device_train_batch_size": 16
186
  }
187
  }
run-3/checkpoint-6500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-7000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ede137ed9f75602fcd765b9df81724cd345e4e8fba7631f5459f47d1e14017b4
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c4a8a041c3286f74682e35df552e9fc99c021008e7d2b87738f3fd82618c362
3
  size 1340618660
run-3/checkpoint-7000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:139a24ccc36cc750e5d554d1c29a253bda9deec8cdd5d9d1183530c461b598b5
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cde3998752916ce1a2e2a136402465ce51c3e96c07515ddae0b4246d99415b6
3
  size 2681472237
run-3/checkpoint-7000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb8d864237da7cff0983198840347a22501258be6388b7177076e08d5c5f5afb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d9034b167dd57fcb13ab131f7a6b12c7467166bd3b6746d19284f0b0fe4a597
3
  size 14244
run-3/checkpoint-7000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79fc7cffde8849d794c3ffd19c096c010d6994fc14914d10c2a72faedff0abcd
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:089dd2cbb01d7d635a13cc85346cf219731871b043661ac60d6bbf8e6d664db2
3
  size 1064
run-3/checkpoint-7000/trainer_state.json CHANGED
@@ -10,181 +10,181 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
- "eval_accuracy": 0.7763996124267578,
99
- "eval_loss": 1.194653034210205,
100
- "eval_runtime": 8.5079,
101
- "eval_samples_per_second": 350.617,
102
- "eval_steps_per_second": 21.98,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
- "grad_norm": 0.04226335510611534,
108
- "learning_rate": 9.29117312275507e-06,
109
- "loss": 0.025,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
- "eval_accuracy": 0.7713711261749268,
115
- "eval_loss": 1.458601713180542,
116
- "eval_runtime": 8.5492,
117
- "eval_samples_per_second": 348.92,
118
- "eval_steps_per_second": 21.873,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
- "grad_norm": 1.8958851099014282,
124
- "learning_rate": 7.94851804721243e-06,
125
- "loss": 0.0138,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
- "grad_norm": 0.009470508433878422,
131
- "learning_rate": 6.60586297166979e-06,
132
- "loss": 0.0098,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
- "eval_accuracy": 0.7660073637962341,
138
- "eval_loss": 1.4296730756759644,
139
- "eval_runtime": 8.5064,
140
- "eval_samples_per_second": 350.675,
141
- "eval_steps_per_second": 21.983,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
- "grad_norm": 0.5303798317909241,
147
- "learning_rate": 5.26320789612715e-06,
148
- "loss": 0.0083,
149
  "step": 5500
150
  },
151
  {
152
  "epoch": 8.0,
153
- "eval_accuracy": 0.7720415592193604,
154
- "eval_loss": 1.5305761098861694,
155
- "eval_runtime": 8.614,
156
- "eval_samples_per_second": 346.297,
157
- "eval_steps_per_second": 21.709,
158
  "step": 5968
159
  },
160
  {
161
  "epoch": 8.04289544235925,
162
- "grad_norm": 0.8904930949211121,
163
- "learning_rate": 3.920552820584509e-06,
164
- "loss": 0.0044,
165
  "step": 6000
166
  },
167
  {
168
  "epoch": 8.71313672922252,
169
- "grad_norm": 15.417210578918457,
170
- "learning_rate": 2.5778977450418694e-06,
171
- "loss": 0.0032,
172
  "step": 6500
173
  },
174
  {
175
  "epoch": 9.0,
176
- "eval_accuracy": 0.7740529775619507,
177
- "eval_loss": 1.7053203582763672,
178
- "eval_runtime": 8.5011,
179
- "eval_samples_per_second": 350.897,
180
- "eval_steps_per_second": 21.997,
181
  "step": 6714
182
  },
183
  {
184
  "epoch": 9.383378016085791,
185
- "grad_norm": 7.497359911212698e-05,
186
- "learning_rate": 1.235242669499229e-06,
187
- "loss": 0.0028,
188
  "step": 7000
189
  }
190
  ],
@@ -193,11 +193,11 @@
193
  "num_input_tokens_seen": 0,
194
  "num_train_epochs": 10,
195
  "save_steps": 500,
196
- "total_flos": 1.659103427531646e+16,
197
  "train_batch_size": 16,
198
  "trial_name": null,
199
  "trial_params": {
200
- "learning_rate": 2.0032413727096193e-05,
201
  "per_device_train_batch_size": 16
202
  }
203
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
+ "eval_accuracy": 0.7700302004814148,
99
+ "eval_loss": 1.4166399240493774,
100
+ "eval_runtime": 8.8683,
101
+ "eval_samples_per_second": 336.367,
102
+ "eval_steps_per_second": 21.086,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
+ "grad_norm": 0.004815839231014252,
108
+ "learning_rate": 1.2636980831890788e-05,
109
+ "loss": 0.034,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
+ "eval_accuracy": 0.7720415592193604,
115
+ "eval_loss": 1.576446533203125,
116
+ "eval_runtime": 8.9004,
117
+ "eval_samples_per_second": 335.152,
118
+ "eval_steps_per_second": 21.01,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
+ "grad_norm": 0.29464954137802124,
124
+ "learning_rate": 1.0810827532484605e-05,
125
+ "loss": 0.0175,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
+ "grad_norm": 0.010658634826540947,
131
+ "learning_rate": 8.984674233078421e-06,
132
+ "loss": 0.0101,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
+ "eval_accuracy": 0.7753939032554626,
138
+ "eval_loss": 1.5760776996612549,
139
+ "eval_runtime": 8.8735,
140
+ "eval_samples_per_second": 336.169,
141
+ "eval_steps_per_second": 21.074,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
+ "grad_norm": 32.647804260253906,
147
+ "learning_rate": 7.158520933672239e-06,
148
+ "loss": 0.0101,
149
  "step": 5500
150
  },
151
  {
152
  "epoch": 8.0,
153
+ "eval_accuracy": 0.7733824849128723,
154
+ "eval_loss": 1.5171312093734741,
155
+ "eval_runtime": 8.9008,
156
+ "eval_samples_per_second": 335.137,
157
+ "eval_steps_per_second": 21.009,
158
  "step": 5968
159
  },
160
  {
161
  "epoch": 8.04289544235925,
162
+ "grad_norm": 0.006861701142042875,
163
+ "learning_rate": 5.332367634266056e-06,
164
+ "loss": 0.0064,
165
  "step": 6000
166
  },
167
  {
168
  "epoch": 8.71313672922252,
169
+ "grad_norm": 1.2482685633585788e-05,
170
+ "learning_rate": 3.5062143348598724e-06,
171
+ "loss": 0.0028,
172
  "step": 6500
173
  },
174
  {
175
  "epoch": 9.0,
176
+ "eval_accuracy": 0.7717063426971436,
177
+ "eval_loss": 1.7022887468338013,
178
+ "eval_runtime": 8.8529,
179
+ "eval_samples_per_second": 336.951,
180
+ "eval_steps_per_second": 21.123,
181
  "step": 6714
182
  },
183
  {
184
  "epoch": 9.383378016085791,
185
+ "grad_norm": 0.0012023162562400103,
186
+ "learning_rate": 1.6800610354536887e-06,
187
+ "loss": 0.0055,
188
  "step": 7000
189
  }
190
  ],
 
193
  "num_input_tokens_seen": 0,
194
  "num_train_epochs": 10,
195
  "save_steps": 500,
196
+ "total_flos": 1.6633284114604788e+16,
197
  "train_batch_size": 16,
198
  "trial_name": null,
199
  "trial_params": {
200
+ "learning_rate": 2.7246207227140256e-05,
201
  "per_device_train_batch_size": 16
202
  }
203
  }
run-3/checkpoint-7000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048