inflaton commited on
Commit
22fdfd2
1 Parent(s): 8619e88

Training in progress, step 500

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19c2e9b0d7dc747a8d8d78f5dc799a8808ec878848fb8586ab212cf9c8c81ffb
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bccd83274c04304a07fc0d7ab77b03e53a90770efa631006c5e6779abee5946
3
  size 1340618660
run-1/checkpoint-3000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9d4fbb66e17a7ca5073ecf2db474ed61313a88e550f6f935e7f4d9d7f8ea2ac
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30b67bd5d070aa809033f8ed851c8c3a9a83bd1760ad2b9ea831dc148d4ca188
3
  size 1340618660
run-1/checkpoint-3000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:415abc2ced34eb3b621e57092815018ef1e5865509f80dfca18ba4b95c232766
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1dc661514cdc6906f5a028903db1b84735bb0338d90ccfa15e78848cd1c6fab
3
  size 2681472237
run-1/checkpoint-3000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6db288463401544c237747deaf541285079a7fac434ce2c87fca0252ed44b357
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:878fd85ac5cda037d90d9829e34a480865a8673564905ce79142723b0e6d8190
3
  size 14244
run-1/checkpoint-3000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cd5355a600c82bb8b4d1dfcb84525d348bfcb0b5974d09401599f4eb9728b3b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:468ac2670c46b5a2bde0dfbd577346b71c90c7f30c911c38763ccb829951016d
3
  size 1064
run-1/checkpoint-3000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d4b8161881416da68cca372d0db6cfca426bea24c4463a15d5c25fafd1f2c02
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb93c6c6aec435a71a91d9e9d85d4dd9cc0b1363951552ae02212d34fb657545
3
  size 5048
run-1/checkpoint-3500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff33e8f4ff43e28961c44b20e3cf48041ce3f8b1a3235983430edae5d176c30
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c872abd773951f54b7207dfb03fbdfcc08d6b72c29d12db38a2fb57c589b4e79
3
  size 1340618660
run-1/checkpoint-3500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bbad93364e29d23d798aeef7cbe53f2874f8dbfdf8d5b90226827e89b12f077
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c51465a72980f347e384fdc30b5c0fdf422a717189b8293f405954bc65ae63b8
3
  size 2681472237
run-1/checkpoint-3500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f4337773f0e3eb3ebd819153d5c9547815b2eb781fb1b3b8acb5f2d3c9910ed
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f89a7fe4a55f5798a402c40524f63ed4391bf96f4737fa9c1c3b85a3eb033165
3
  size 14244
run-1/checkpoint-3500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d1eacb7d0907646116f46e587547d539021a986bc4e71dd2131412a1e053881
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e15c4a9d4ccbbbe8f000aafc73dbb152ac59304f61975211eceaa3dabed117db
3
  size 1064
run-1/checkpoint-3500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d4b8161881416da68cca372d0db6cfca426bea24c4463a15d5c25fafd1f2c02
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb93c6c6aec435a71a91d9e9d85d4dd9cc0b1363951552ae02212d34fb657545
3
  size 5048
run-2/checkpoint-1000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.3404825737265416,
5
  "eval_steps": 500,
6
  "global_step": 1000,
7
  "is_hyper_param_search": true,
@@ -9,39 +9,48 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 17.923147201538086,
14
- "learning_rate": 1.3001640168623715e-05,
15
- "loss": 0.5514,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.7683539986610413,
21
- "eval_loss": 0.46452510356903076,
22
- "eval_runtime": 8.9706,
23
- "eval_samples_per_second": 332.531,
24
- "eval_steps_per_second": 20.846,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 19.32151222229004,
30
- "learning_rate": 1.2067614294440976e-05,
31
- "loss": 0.3969,
32
  "step": 1000
33
  }
34
  ],
35
  "logging_steps": 500,
36
- "max_steps": 7460,
37
  "num_input_tokens_seen": 0,
38
  "num_train_epochs": 10,
39
  "save_steps": 500,
40
- "total_flos": 2375543264345280.0,
41
- "train_batch_size": 16,
42
  "trial_name": null,
43
  "trial_params": {
44
- "learning_rate": 1.3935666042806453e-05,
45
- "per_device_train_batch_size": 16
46
  }
47
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.680965147453083,
5
  "eval_steps": 500,
6
  "global_step": 1000,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7525980472564697,
14
+ "eval_loss": 0.4786739945411682,
15
+ "eval_runtime": 8.9408,
16
+ "eval_samples_per_second": 333.638,
17
+ "eval_steps_per_second": 20.915,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 76.13782501220703,
23
+ "learning_rate": 1.6728270896334427e-05,
24
+ "loss": 0.4795,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7680187821388245,
30
+ "eval_loss": 0.5211130976676941,
31
+ "eval_runtime": 8.9873,
32
+ "eval_samples_per_second": 331.913,
33
+ "eval_steps_per_second": 20.807,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 7.740549087524414,
39
+ "learning_rate": 1.4138755277706807e-05,
40
+ "loss": 0.2127,
41
  "step": 1000
42
  }
43
  ],
44
  "logging_steps": 500,
45
+ "max_steps": 3730,
46
  "num_input_tokens_seen": 0,
47
  "num_train_epochs": 10,
48
  "save_steps": 500,
49
+ "total_flos": 5098033209994488.0,
50
+ "train_batch_size": 32,
51
  "trial_name": null,
52
  "trial_params": {
53
+ "learning_rate": 1.9317786514962047e-05,
54
+ "per_device_train_batch_size": 32
55
  }
56
  }
run-2/checkpoint-1500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0107238605898123,
5
  "eval_steps": 500,
6
  "global_step": 1500,
7
  "is_hyper_param_search": true,
@@ -9,55 +9,73 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 17.923147201538086,
14
- "learning_rate": 1.3001640168623715e-05,
15
- "loss": 0.5514,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.7683539986610413,
21
- "eval_loss": 0.46452510356903076,
22
- "eval_runtime": 8.9706,
23
- "eval_samples_per_second": 332.531,
24
- "eval_steps_per_second": 20.846,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 19.32151222229004,
30
- "learning_rate": 1.2067614294440976e-05,
31
- "loss": 0.3969,
32
  "step": 1000
33
  },
34
  {
35
- "epoch": 2.0,
36
- "eval_accuracy": 0.7643312215805054,
37
- "eval_loss": 0.5671606063842773,
38
- "eval_runtime": 9.0896,
39
- "eval_samples_per_second": 328.177,
40
- "eval_steps_per_second": 20.573,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
- "epoch": 2.0107238605898123,
45
- "grad_norm": 17.702476501464844,
46
- "learning_rate": 1.1133588420258238e-05,
47
- "loss": 0.2998,
48
  "step": 1500
49
  }
50
  ],
51
  "logging_steps": 500,
52
- "max_steps": 7460,
53
  "num_input_tokens_seen": 0,
54
  "num_train_epochs": 10,
55
  "save_steps": 500,
56
- "total_flos": 3564872964001584.0,
57
- "train_batch_size": 16,
58
  "trial_name": null,
59
  "trial_params": {
60
- "learning_rate": 1.3935666042806453e-05,
61
- "per_device_train_batch_size": 16
62
  }
63
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.021447721179625,
5
  "eval_steps": 500,
6
  "global_step": 1500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7525980472564697,
14
+ "eval_loss": 0.4786739945411682,
15
+ "eval_runtime": 8.9408,
16
+ "eval_samples_per_second": 333.638,
17
+ "eval_steps_per_second": 20.915,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 76.13782501220703,
23
+ "learning_rate": 1.6728270896334427e-05,
24
+ "loss": 0.4795,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7680187821388245,
30
+ "eval_loss": 0.5211130976676941,
31
+ "eval_runtime": 8.9873,
32
+ "eval_samples_per_second": 331.913,
33
+ "eval_steps_per_second": 20.807,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 7.740549087524414,
39
+ "learning_rate": 1.4138755277706807e-05,
40
+ "loss": 0.2127,
41
  "step": 1000
42
  },
43
  {
44
+ "epoch": 3.0,
45
+ "eval_accuracy": 0.7683539986610413,
46
+ "eval_loss": 0.8232186436653137,
47
+ "eval_runtime": 8.9717,
48
+ "eval_samples_per_second": 332.49,
49
+ "eval_steps_per_second": 20.843,
50
+ "step": 1119
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.7660073637962341,
55
+ "eval_loss": 1.132562518119812,
56
+ "eval_runtime": 9.1516,
57
+ "eval_samples_per_second": 325.954,
58
+ "eval_steps_per_second": 20.434,
59
  "step": 1492
60
  },
61
  {
62
+ "epoch": 4.021447721179625,
63
+ "grad_norm": 2.1688199043273926,
64
+ "learning_rate": 1.1549239659079185e-05,
65
+ "loss": 0.0802,
66
  "step": 1500
67
  }
68
  ],
69
  "logging_steps": 500,
70
+ "max_steps": 3730,
71
  "num_input_tokens_seen": 0,
72
  "num_train_epochs": 10,
73
  "save_steps": 500,
74
+ "total_flos": 7632131682870108.0,
75
+ "train_batch_size": 32,
76
  "trial_name": null,
77
  "trial_params": {
78
+ "learning_rate": 1.9317786514962047e-05,
79
+ "per_device_train_batch_size": 32
80
  }
81
  }
run-2/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c04ced4e95104bc5ec0123541b73b7f8b1f5cb81b734fc7031750ea9106c9345
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bccd83274c04304a07fc0d7ab77b03e53a90770efa631006c5e6779abee5946
3
  size 1340618660
run-2/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63b024ec96d986f6b82a3d0435da50c96ff74ecb7dbbbf638e5d2cd56c065a36
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f887138e43828f62cfed1e28eb6aa919a2144b02d9d355c7ebeb0c66cac4fc43
3
  size 2681472237
run-2/checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f544045d8bc120b0bef3c491fba9f1ed6efda96a8fe519bf19d9f17a0a9934ac
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:268959cd06affa51103f53408d4fbfdbd757b4027cad9ad299a004dbe705b408
3
  size 14244
run-2/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d591d496bef394ec33ea58d896f0531283796cc8c8253f2381e962a88b5f9e0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d319558b9ee6e8ba623e2291b50f3a70dbc201ed081b709c0215db617558c2da
3
  size 1064
run-2/checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6702412868632708,
5
  "eval_steps": 500,
6
  "global_step": 500,
7
  "is_hyper_param_search": true,
@@ -9,23 +9,32 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 17.923147201538086,
14
- "learning_rate": 1.3001640168623715e-05,
15
- "loss": 0.5514,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  }
18
  ],
19
  "logging_steps": 500,
20
- "max_steps": 7460,
21
  "num_input_tokens_seen": 0,
22
  "num_train_epochs": 10,
23
  "save_steps": 500,
24
- "total_flos": 1189489874818176.0,
25
- "train_batch_size": 16,
26
  "trial_name": null,
27
  "trial_params": {
28
- "learning_rate": 1.3935666042806453e-05,
29
- "per_device_train_batch_size": 16
30
  }
31
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.3404825737265416,
5
  "eval_steps": 500,
6
  "global_step": 500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7525980472564697,
14
+ "eval_loss": 0.4786739945411682,
15
+ "eval_runtime": 8.9408,
16
+ "eval_samples_per_second": 333.638,
17
+ "eval_steps_per_second": 20.915,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 76.13782501220703,
23
+ "learning_rate": 1.6728270896334427e-05,
24
+ "loss": 0.4795,
25
  "step": 500
26
  }
27
  ],
28
  "logging_steps": 500,
29
+ "max_steps": 3730,
30
  "num_input_tokens_seen": 0,
31
  "num_train_epochs": 10,
32
  "save_steps": 500,
33
+ "total_flos": 2552813484402528.0,
34
+ "train_batch_size": 32,
35
  "trial_name": null,
36
  "trial_params": {
37
+ "learning_rate": 1.9317786514962047e-05,
38
+ "per_device_train_batch_size": 32
39
  }
40
  }
run-2/checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c68cfe34dae325096495265befbc04045f52f535badd4f779e9c272d3d0b0f80
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ed49fe002187d5ffed6d819c5938a827ebb9152529606096a657c65a1141ceb
3
  size 5048
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb93c6c6aec435a71a91d9e9d85d4dd9cc0b1363951552ae02212d34fb657545
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ed49fe002187d5ffed6d819c5938a827ebb9152529606096a657c65a1141ceb
3
  size 5048