inflaton commited on
Commit
c7488e9
1 Parent(s): 0829bc0

Training in progress, step 3000

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be3fbf39b8eb79e3df9922f6763aa922164fc565b8a550581ae148e0984a5a12
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a80e1edfa34ca6c9f099aeb1b12228bd8f4731ff771d848cc487600202c82666
3
  size 1340618660
run-3/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89eb15914f76ab1c090ddced14596edd02819d04b2426bf130e354b731475796
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c81a9f4242b6291e55b39407144fcc903a35dde7595a8f838278f2c3657269c2
3
  size 1340618660
run-3/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbee786d3c9a0815ea0d7ef969f043d24eb48ae5804b355f9ed33f10eb7ac44a
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:088951e1efcae1165260a06a5f79972a7473991f6a84cbe38dfde9bbc32ab8ce
3
  size 2681472237
run-3/checkpoint-1500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68e80399702242050ed41d20e2421017130de64e7657f296dcacd743d4ff3ed7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdad94d4629168a2bf6ebe9809c6aa9b7f4c7bb49624257aed0f70092ede3df7
3
  size 14244
run-3/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4814fc7eacae34322323e9943a8dbf939b3c515892e92f467d342ec927e0fb9f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4a08d061763b774416959da40fe22321436bf84b743529ed3e16bd678714ed4
3
  size 1064
run-3/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
run-3/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09f2b67b5f3a2cd1f2760f0a4e1a453f12528f720e5d7cedc155e0ea6d4024da
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d7e639e12fa05ab793737459f1a86157dfb76d767f1d5eb19931edb98017649
3
  size 1340618660
run-3/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97cd17dc794999681684ac54711eabfde63e7faa96162b14835ec207156e6d5b
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72206922f1b9081896184e0e81442f186f632d854eaa2a989a764da3289fafd7
3
  size 2681472237
run-3/checkpoint-2000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20d32b819bd77eea147826b2a2b808978e80f3625ab8b137fd60442356e76651
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b3bc1d93c8e5d77d91605f4094eab1adea91fbfc16b782d04a37130f913871
3
  size 14244
run-3/checkpoint-2000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55035159b5a6af76535bc4d44fd557453dc3f7f1512353bc0a247aa969cd1850
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c5bf5016802e656b127c6e2dfcd024ab9f999b8198f48afb1fbc97e7f70a98a
3
  size 1064
run-3/checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
run-3/checkpoint-2500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb68d328ea3f6ebed3e168fc0ee2b06496cd90852d4f7324c6d0ea655c77e95b
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f888cc9b2d239afd1f796ecf442889dcb89847e1551d533f970a04b47784e205
3
  size 1340618660
run-3/checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3450f41e9cea19dc5e8b39da0a52d188b6b8d55ea9b30b45c062b3286bcaacc
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:833200cadeb92b311e1e85963436525c1884e05d9f5dd523bc334344740d456a
3
  size 2681472237
run-3/checkpoint-2500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21b476fecc79abe6524ab1b3c0c4434514088f57ee27622abde280b9e58b5bea
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e901d2a4b6b7576bd2641f76be3ac251aed1d25d8496953ff1ca82ba6468b670
3
  size 14244
run-3/checkpoint-2500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fc38c645558cbc6470f0b5ce18a519170a71e046ac3f47ce7c54069c4e66437
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7420d9f04f235d5ca197b6ebe7418b52a8bf4c738ae163581d29c15a251463e0
3
  size 1064
run-3/checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
run-3/checkpoint-3000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79d8a68457cc61210437b7501c47ff73cacea377d37cc9901e3d08604fa72469
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a80e1edfa34ca6c9f099aeb1b12228bd8f4731ff771d848cc487600202c82666
3
  size 1340618660
run-3/checkpoint-3000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f841a449be3a51fdad86ea7c02b4c9f0c691588d900e94ddd05e557618b97477
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af1ed3f623429f004ff11ec563af27a5c9e0edc66b8bb942ad852c8b20892568
3
  size 2681472237
run-3/checkpoint-3000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46b4bf6486fe129b76796c932cd300ca16a901432ef276571c3d18583e28daa2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fda71fd5f2516582f65b43df160d9b94f0e4b0fc28d9135bb5b0ac484f494d5
3
  size 14244
run-3/checkpoint-3000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:113b90be488fa19725e4fd0544044683f1bda02a0f06b74665ca92b78c642ee8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f283f1f579c8656f5cf2b68f582eabd3dd9ab3ae1c7e0a2e5b7311b18f0970c5
3
  size 1064
run-3/checkpoint-3000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.021447721179625,
5
  "eval_steps": 500,
6
  "global_step": 3000,
7
  "is_hyper_param_search": true,
@@ -9,94 +9,130 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
- "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
- "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
- "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
- "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
- "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
- "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
 
 
 
 
 
 
 
 
 
80
  "step": 2984
81
  },
82
  {
83
- "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  }
89
  ],
90
  "logging_steps": 500,
91
- "max_steps": 7460,
92
  "num_input_tokens_seen": 0,
93
  "num_train_epochs": 10,
94
  "save_steps": 500,
95
- "total_flos": 7104904216534656.0,
96
- "train_batch_size": 16,
97
  "trial_name": null,
98
  "trial_params": {
99
- "learning_rate": 2.0032413727096193e-05,
100
- "per_device_train_batch_size": 16
101
  }
102
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 8.04289544235925,
5
  "eval_steps": 500,
6
  "global_step": 3000,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7727120518684387,
14
+ "eval_loss": 0.46057018637657166,
15
+ "eval_runtime": 8.9113,
16
+ "eval_samples_per_second": 334.743,
17
+ "eval_steps_per_second": 20.985,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 8.923430442810059,
23
+ "learning_rate": 2.587666867882196e-05,
24
+ "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7770700454711914,
30
+ "eval_loss": 0.5375419855117798,
31
+ "eval_runtime": 9.0162,
32
+ "eval_samples_per_second": 330.848,
33
+ "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 30.215883255004883,
39
+ "learning_rate": 2.187099241275045e-05,
40
+ "loss": 0.176,
41
  "step": 1000
42
  },
43
  {
44
+ "epoch": 3.0,
45
+ "eval_accuracy": 0.7777405381202698,
46
+ "eval_loss": 0.948959469795227,
47
+ "eval_runtime": 9.007,
48
+ "eval_samples_per_second": 331.187,
49
+ "eval_steps_per_second": 20.762,
50
+ "step": 1119
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.7760643362998962,
55
+ "eval_loss": 1.1204984188079834,
56
+ "eval_runtime": 9.0575,
57
+ "eval_samples_per_second": 329.342,
58
+ "eval_steps_per_second": 20.646,
59
  "step": 1492
60
  },
61
  {
62
+ "epoch": 4.021447721179625,
63
+ "grad_norm": 0.16835728287696838,
64
+ "learning_rate": 1.7865316146678937e-05,
65
+ "loss": 0.0608,
66
  "step": 1500
67
  },
68
  {
69
+ "epoch": 5.0,
70
+ "eval_accuracy": 0.7787462472915649,
71
+ "eval_loss": 1.4213643074035645,
72
+ "eval_runtime": 9.0611,
73
+ "eval_samples_per_second": 329.211,
74
+ "eval_steps_per_second": 20.638,
75
+ "step": 1865
76
+ },
77
+ {
78
+ "epoch": 5.361930294906166,
79
+ "grad_norm": 0.16181084513664246,
80
+ "learning_rate": 1.3859639880607426e-05,
81
+ "loss": 0.0242,
82
  "step": 2000
83
  },
84
  {
85
+ "epoch": 6.0,
86
+ "eval_accuracy": 0.7784109711647034,
87
+ "eval_loss": 1.251684308052063,
88
+ "eval_runtime": 9.0244,
89
+ "eval_samples_per_second": 330.548,
90
+ "eval_steps_per_second": 20.722,
91
  "step": 2238
92
  },
93
  {
94
+ "epoch": 6.702412868632708,
95
+ "grad_norm": 2.160583734512329,
96
+ "learning_rate": 9.853963614535916e-06,
97
+ "loss": 0.0136,
98
  "step": 2500
99
  },
100
  {
101
+ "epoch": 7.0,
102
+ "eval_accuracy": 0.7753939032554626,
103
+ "eval_loss": 1.3670138120651245,
104
+ "eval_runtime": 9.0134,
105
+ "eval_samples_per_second": 330.951,
106
+ "eval_steps_per_second": 20.747,
107
+ "step": 2611
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "eval_accuracy": 0.7794166803359985,
112
+ "eval_loss": 1.4973613023757935,
113
+ "eval_runtime": 9.0776,
114
+ "eval_samples_per_second": 328.611,
115
+ "eval_steps_per_second": 20.6,
116
  "step": 2984
117
  },
118
  {
119
+ "epoch": 8.04289544235925,
120
+ "grad_norm": 1.054295301437378,
121
+ "learning_rate": 5.848287348464405e-06,
122
+ "loss": 0.0045,
123
  "step": 3000
124
  }
125
  ],
126
  "logging_steps": 500,
127
+ "max_steps": 3730,
128
  "num_input_tokens_seen": 0,
129
  "num_train_epochs": 10,
130
  "save_steps": 500,
131
+ "total_flos": 1.5221292738223464e+16,
132
+ "train_batch_size": 32,
133
  "trial_name": null,
134
  "trial_params": {
135
+ "learning_rate": 2.988234494489347e-05,
136
+ "per_device_train_batch_size": 32
137
  }
138
  }
run-3/checkpoint-3000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
run-3/checkpoint-3500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91c1c7872069f1e75b7ad7b474ea5aaff746e00dfcb80ddd397d46c013bb74e1
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19572f5d46152241f93d2ce9ea0af917f6d4e31d54467419b231809c5c78820d
3
  size 1340618660
run-3/checkpoint-3500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dca0bbb2815780bbc23a24abd6a5572ea4046a6942142b9d44d473ab4cd0ec47
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be76882b15fffb0814989424f9f49c8a80185c15fd13bbaa0448e30d2530166f
3
  size 2681472237
run-3/checkpoint-3500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30910bf1ca621c472cd5279d1d2caa68bc73d70f3f33bac8a066bc65c9f6f566
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28276f7287b26243a28773084d3832b67115e4dc44e251cd9263cf32052e7f9a
3
  size 14244
run-3/checkpoint-3500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0410e931fd11a01f4886fe28aa9f5d4540d3bc0a753051bcc11735f495689c95
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf0b5d61230faf94be94de245eb52273abc7e93e8c0e9e2a11b6190fb32d452f
3
  size 1064
run-3/checkpoint-3500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.6916890080428955,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
@@ -9,101 +9,146 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
- "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
- "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
- "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
- "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
- "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
- "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
 
 
 
 
 
 
 
 
 
80
  "step": 2984
81
  },
82
  {
83
- "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
- "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
 
 
 
 
 
 
 
 
 
94
  "step": 3500
95
  }
96
  ],
97
  "logging_steps": 500,
98
- "max_steps": 7460,
99
  "num_input_tokens_seen": 0,
100
  "num_train_epochs": 10,
101
  "save_steps": 500,
102
- "total_flos": 8287346384230464.0,
103
- "train_batch_size": 16,
104
  "trial_name": null,
105
  "trial_params": {
106
- "learning_rate": 2.0032413727096193e-05,
107
- "per_device_train_batch_size": 16
108
  }
109
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.383378016085791,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7727120518684387,
14
+ "eval_loss": 0.46057018637657166,
15
+ "eval_runtime": 8.9113,
16
+ "eval_samples_per_second": 334.743,
17
+ "eval_steps_per_second": 20.985,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 8.923430442810059,
23
+ "learning_rate": 2.587666867882196e-05,
24
+ "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7770700454711914,
30
+ "eval_loss": 0.5375419855117798,
31
+ "eval_runtime": 9.0162,
32
+ "eval_samples_per_second": 330.848,
33
+ "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 30.215883255004883,
39
+ "learning_rate": 2.187099241275045e-05,
40
+ "loss": 0.176,
41
  "step": 1000
42
  },
43
  {
44
+ "epoch": 3.0,
45
+ "eval_accuracy": 0.7777405381202698,
46
+ "eval_loss": 0.948959469795227,
47
+ "eval_runtime": 9.007,
48
+ "eval_samples_per_second": 331.187,
49
+ "eval_steps_per_second": 20.762,
50
+ "step": 1119
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.7760643362998962,
55
+ "eval_loss": 1.1204984188079834,
56
+ "eval_runtime": 9.0575,
57
+ "eval_samples_per_second": 329.342,
58
+ "eval_steps_per_second": 20.646,
59
  "step": 1492
60
  },
61
  {
62
+ "epoch": 4.021447721179625,
63
+ "grad_norm": 0.16835728287696838,
64
+ "learning_rate": 1.7865316146678937e-05,
65
+ "loss": 0.0608,
66
  "step": 1500
67
  },
68
  {
69
+ "epoch": 5.0,
70
+ "eval_accuracy": 0.7787462472915649,
71
+ "eval_loss": 1.4213643074035645,
72
+ "eval_runtime": 9.0611,
73
+ "eval_samples_per_second": 329.211,
74
+ "eval_steps_per_second": 20.638,
75
+ "step": 1865
76
+ },
77
+ {
78
+ "epoch": 5.361930294906166,
79
+ "grad_norm": 0.16181084513664246,
80
+ "learning_rate": 1.3859639880607426e-05,
81
+ "loss": 0.0242,
82
  "step": 2000
83
  },
84
  {
85
+ "epoch": 6.0,
86
+ "eval_accuracy": 0.7784109711647034,
87
+ "eval_loss": 1.251684308052063,
88
+ "eval_runtime": 9.0244,
89
+ "eval_samples_per_second": 330.548,
90
+ "eval_steps_per_second": 20.722,
91
  "step": 2238
92
  },
93
  {
94
+ "epoch": 6.702412868632708,
95
+ "grad_norm": 2.160583734512329,
96
+ "learning_rate": 9.853963614535916e-06,
97
+ "loss": 0.0136,
98
  "step": 2500
99
  },
100
  {
101
+ "epoch": 7.0,
102
+ "eval_accuracy": 0.7753939032554626,
103
+ "eval_loss": 1.3670138120651245,
104
+ "eval_runtime": 9.0134,
105
+ "eval_samples_per_second": 330.951,
106
+ "eval_steps_per_second": 20.747,
107
+ "step": 2611
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "eval_accuracy": 0.7794166803359985,
112
+ "eval_loss": 1.4973613023757935,
113
+ "eval_runtime": 9.0776,
114
+ "eval_samples_per_second": 328.611,
115
+ "eval_steps_per_second": 20.6,
116
  "step": 2984
117
  },
118
  {
119
+ "epoch": 8.04289544235925,
120
+ "grad_norm": 1.054295301437378,
121
+ "learning_rate": 5.848287348464405e-06,
122
+ "loss": 0.0045,
123
  "step": 3000
124
  },
125
  {
126
+ "epoch": 9.0,
127
+ "eval_accuracy": 0.7797519564628601,
128
+ "eval_loss": 1.4273899793624878,
129
+ "eval_runtime": 9.0355,
130
+ "eval_samples_per_second": 330.142,
131
+ "eval_steps_per_second": 20.696,
132
+ "step": 3357
133
+ },
134
+ {
135
+ "epoch": 9.383378016085791,
136
+ "grad_norm": 0.03208499401807785,
137
+ "learning_rate": 1.8426110823928949e-06,
138
+ "loss": 0.0039,
139
  "step": 3500
140
  }
141
  ],
142
  "logging_steps": 500,
143
+ "max_steps": 3730,
144
  "num_input_tokens_seen": 0,
145
  "num_train_epochs": 10,
146
  "save_steps": 500,
147
+ "total_flos": 1.774860924913164e+16,
148
+ "train_batch_size": 32,
149
  "trial_name": null,
150
  "trial_params": {
151
+ "learning_rate": 2.988234494489347e-05,
152
+ "per_device_train_batch_size": 32
153
  }
154
  }
run-3/checkpoint-3500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004dbac8124de382164fc08dbd1e3ccce3b5d6b42a0c23f879a1925c177310cf
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048