inflaton committed on
Commit
b8f963c
1 Parent(s): e83a66f

Training in progress, step 500

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b23bd5bcb6c3f20acbd47f46114a67d639b650403922c7d1cf092f251113025
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29808a477e6dfe819802c452b4eb8059a040db301b42679e25eb9cd4177ce709
3
  size 1340618660
run-3/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d9649ba9e265d8e4fe5d708f3e7adb9d6191f9844715240ec35d509ec8e5fe6
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f21ff9e01d89b51eac90f2eeae545d2208d13b78f82cd701436e3013992c9c0e
3
  size 1340618660
run-3/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6ef50bc2eb08bb3dbb1e2f70d0fdf07d835c95837a40f398f4436b48b906838
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4074d2e16647b008b1fa854a9a0bb1e2ebe2c4168cc335c401b5cb7b7032366e
3
  size 2681472237
run-3/checkpoint-1000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f15ef18f77678b087c8d8d016723a03f0d0f4e837051c67af83ea5404cfb3e09
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38c752dc0df82afd3fc4927225721c6718f89b642523e4400a9217d164bd5d92
3
  size 14244
run-3/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2e778733aea3d76f79d700f7557d56cc34f9c1df8dc23634cecfda952769b78
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b113482378fbd43976c3c756f387e018112046d66480e853eb7f9eecc00366b7
3
  size 1064
run-3/checkpoint-1000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.680965147453083,
5
  "eval_steps": 500,
6
  "global_step": 1000,
7
  "is_hyper_param_search": true,
@@ -9,48 +9,39 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.7727120518684387,
14
- "eval_loss": 0.46057018637657166,
15
- "eval_runtime": 8.9113,
16
- "eval_samples_per_second": 334.743,
17
- "eval_steps_per_second": 20.985,
18
- "step": 373
19
- },
20
- {
21
- "epoch": 1.3404825737265416,
22
- "grad_norm": 8.923430442810059,
23
- "learning_rate": 2.587666867882196e-05,
24
- "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
- "epoch": 2.0,
29
- "eval_accuracy": 0.7770700454711914,
30
- "eval_loss": 0.5375419855117798,
31
- "eval_runtime": 9.0162,
32
- "eval_samples_per_second": 330.848,
33
- "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
- "epoch": 2.680965147453083,
38
- "grad_norm": 30.215883255004883,
39
- "learning_rate": 2.187099241275045e-05,
40
- "loss": 0.176,
41
  "step": 1000
42
  }
43
  ],
44
  "logging_steps": 500,
45
- "max_steps": 3730,
46
  "num_input_tokens_seen": 0,
47
  "num_train_epochs": 10,
48
  "save_steps": 500,
49
- "total_flos": 5069168917756236.0,
50
- "train_batch_size": 32,
51
  "trial_name": null,
52
  "trial_params": {
53
- "learning_rate": 2.988234494489347e-05,
54
- "per_device_train_batch_size": 32
55
  }
56
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.3404825737265416,
5
  "eval_steps": 500,
6
  "global_step": 1000,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
+ "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  }
34
  ],
35
  "logging_steps": 500,
36
+ "max_steps": 7460,
37
  "num_input_tokens_seen": 0,
38
  "num_train_epochs": 10,
39
  "save_steps": 500,
40
+ "total_flos": 2380566939876720.0,
41
+ "train_batch_size": 16,
42
  "trial_name": null,
43
  "trial_params": {
44
+ "learning_rate": 2.7246207227140256e-05,
45
+ "per_device_train_batch_size": 16
46
  }
47
  }
run-3/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-1500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.021447721179625,
5
  "eval_steps": 500,
6
  "global_step": 1500,
7
  "is_hyper_param_search": true,
@@ -9,73 +9,55 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.7727120518684387,
14
- "eval_loss": 0.46057018637657166,
15
- "eval_runtime": 8.9113,
16
- "eval_samples_per_second": 334.743,
17
- "eval_steps_per_second": 20.985,
18
- "step": 373
19
- },
20
- {
21
- "epoch": 1.3404825737265416,
22
- "grad_norm": 8.923430442810059,
23
- "learning_rate": 2.587666867882196e-05,
24
- "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
- "epoch": 2.0,
29
- "eval_accuracy": 0.7770700454711914,
30
- "eval_loss": 0.5375419855117798,
31
- "eval_runtime": 9.0162,
32
- "eval_samples_per_second": 330.848,
33
- "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
- "epoch": 2.680965147453083,
38
- "grad_norm": 30.215883255004883,
39
- "learning_rate": 2.187099241275045e-05,
40
- "loss": 0.176,
41
  "step": 1000
42
  },
43
  {
44
- "epoch": 3.0,
45
- "eval_accuracy": 0.7777405381202698,
46
- "eval_loss": 0.948959469795227,
47
- "eval_runtime": 9.007,
48
- "eval_samples_per_second": 331.187,
49
- "eval_steps_per_second": 20.762,
50
- "step": 1119
51
- },
52
- {
53
- "epoch": 4.0,
54
- "eval_accuracy": 0.7760643362998962,
55
- "eval_loss": 1.1204984188079834,
56
- "eval_runtime": 9.0575,
57
- "eval_samples_per_second": 329.342,
58
- "eval_steps_per_second": 20.646,
59
  "step": 1492
60
  },
61
  {
62
- "epoch": 4.021447721179625,
63
- "grad_norm": 0.16835728287696838,
64
- "learning_rate": 1.7865316146678937e-05,
65
- "loss": 0.0608,
66
  "step": 1500
67
  }
68
  ],
69
  "logging_steps": 500,
70
- "max_steps": 3730,
71
  "num_input_tokens_seen": 0,
72
  "num_train_epochs": 10,
73
  "save_steps": 500,
74
- "total_flos": 7610944877367948.0,
75
- "train_batch_size": 32,
76
  "trial_name": null,
77
  "trial_params": {
78
- "learning_rate": 2.988234494489347e-05,
79
- "per_device_train_batch_size": 32
80
  }
81
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0107238605898123,
5
  "eval_steps": 500,
6
  "global_step": 1500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
+ "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
+ "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  }
50
  ],
51
  "logging_steps": 500,
52
+ "max_steps": 7460,
53
  "num_input_tokens_seen": 0,
54
  "num_train_epochs": 10,
55
  "save_steps": 500,
56
+ "total_flos": 3566922478004628.0,
57
+ "train_batch_size": 16,
58
  "trial_name": null,
59
  "trial_params": {
60
+ "learning_rate": 2.7246207227140256e-05,
61
+ "per_device_train_batch_size": 16
62
  }
63
  }
run-3/checkpoint-2000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.361930294906166,
5
  "eval_steps": 500,
6
  "global_step": 2000,
7
  "is_hyper_param_search": true,
@@ -9,89 +9,62 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.7727120518684387,
14
- "eval_loss": 0.46057018637657166,
15
- "eval_runtime": 8.9113,
16
- "eval_samples_per_second": 334.743,
17
- "eval_steps_per_second": 20.985,
18
- "step": 373
19
- },
20
- {
21
- "epoch": 1.3404825737265416,
22
- "grad_norm": 8.923430442810059,
23
- "learning_rate": 2.587666867882196e-05,
24
- "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
- "epoch": 2.0,
29
- "eval_accuracy": 0.7770700454711914,
30
- "eval_loss": 0.5375419855117798,
31
- "eval_runtime": 9.0162,
32
- "eval_samples_per_second": 330.848,
33
- "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
- "epoch": 2.680965147453083,
38
- "grad_norm": 30.215883255004883,
39
- "learning_rate": 2.187099241275045e-05,
40
- "loss": 0.176,
41
  "step": 1000
42
  },
43
  {
44
- "epoch": 3.0,
45
- "eval_accuracy": 0.7777405381202698,
46
- "eval_loss": 0.948959469795227,
47
- "eval_runtime": 9.007,
48
- "eval_samples_per_second": 331.187,
49
- "eval_steps_per_second": 20.762,
50
- "step": 1119
51
- },
52
- {
53
- "epoch": 4.0,
54
- "eval_accuracy": 0.7760643362998962,
55
- "eval_loss": 1.1204984188079834,
56
- "eval_runtime": 9.0575,
57
- "eval_samples_per_second": 329.342,
58
- "eval_steps_per_second": 20.646,
59
  "step": 1492
60
  },
61
  {
62
- "epoch": 4.021447721179625,
63
- "grad_norm": 0.16835728287696838,
64
- "learning_rate": 1.7865316146678937e-05,
65
- "loss": 0.0608,
66
  "step": 1500
67
  },
68
  {
69
- "epoch": 5.0,
70
- "eval_accuracy": 0.7787462472915649,
71
- "eval_loss": 1.4213643074035645,
72
- "eval_runtime": 9.0611,
73
- "eval_samples_per_second": 329.211,
74
- "eval_steps_per_second": 20.638,
75
- "step": 1865
76
- },
77
- {
78
- "epoch": 5.361930294906166,
79
- "grad_norm": 0.16181084513664246,
80
- "learning_rate": 1.3859639880607426e-05,
81
- "loss": 0.0242,
82
  "step": 2000
83
  }
84
  ],
85
  "logging_steps": 500,
86
- "max_steps": 3730,
87
  "num_input_tokens_seen": 0,
88
  "num_train_epochs": 10,
89
  "save_steps": 500,
90
- "total_flos": 1.01458223839854e+16,
91
- "train_batch_size": 32,
92
  "trial_name": null,
93
  "trial_params": {
94
- "learning_rate": 2.988234494489347e-05,
95
- "per_device_train_batch_size": 32
96
  }
97
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.680965147453083,
5
  "eval_steps": 500,
6
  "global_step": 2000,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
+ "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
+ "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
+ "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  }
57
  ],
58
  "logging_steps": 500,
59
+ "max_steps": 7460,
60
  "num_input_tokens_seen": 0,
61
  "num_train_epochs": 10,
62
  "save_steps": 500,
63
+ "total_flos": 4761479712489300.0,
64
+ "train_batch_size": 16,
65
  "trial_name": null,
66
  "trial_params": {
67
+ "learning_rate": 2.7246207227140256e-05,
68
+ "per_device_train_batch_size": 16
69
  }
70
  }
run-3/checkpoint-2500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 6.702412868632708,
5
  "eval_steps": 500,
6
  "global_step": 2500,
7
  "is_hyper_param_search": true,
@@ -9,105 +9,78 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.7727120518684387,
14
- "eval_loss": 0.46057018637657166,
15
- "eval_runtime": 8.9113,
16
- "eval_samples_per_second": 334.743,
17
- "eval_steps_per_second": 20.985,
18
- "step": 373
19
- },
20
- {
21
- "epoch": 1.3404825737265416,
22
- "grad_norm": 8.923430442810059,
23
- "learning_rate": 2.587666867882196e-05,
24
- "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
- "epoch": 2.0,
29
- "eval_accuracy": 0.7770700454711914,
30
- "eval_loss": 0.5375419855117798,
31
- "eval_runtime": 9.0162,
32
- "eval_samples_per_second": 330.848,
33
- "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
- "epoch": 2.680965147453083,
38
- "grad_norm": 30.215883255004883,
39
- "learning_rate": 2.187099241275045e-05,
40
- "loss": 0.176,
41
  "step": 1000
42
  },
43
  {
44
- "epoch": 3.0,
45
- "eval_accuracy": 0.7777405381202698,
46
- "eval_loss": 0.948959469795227,
47
- "eval_runtime": 9.007,
48
- "eval_samples_per_second": 331.187,
49
- "eval_steps_per_second": 20.762,
50
- "step": 1119
51
- },
52
- {
53
- "epoch": 4.0,
54
- "eval_accuracy": 0.7760643362998962,
55
- "eval_loss": 1.1204984188079834,
56
- "eval_runtime": 9.0575,
57
- "eval_samples_per_second": 329.342,
58
- "eval_steps_per_second": 20.646,
59
  "step": 1492
60
  },
61
  {
62
- "epoch": 4.021447721179625,
63
- "grad_norm": 0.16835728287696838,
64
- "learning_rate": 1.7865316146678937e-05,
65
- "loss": 0.0608,
66
  "step": 1500
67
  },
68
  {
69
- "epoch": 5.0,
70
- "eval_accuracy": 0.7787462472915649,
71
- "eval_loss": 1.4213643074035645,
72
- "eval_runtime": 9.0611,
73
- "eval_samples_per_second": 329.211,
74
- "eval_steps_per_second": 20.638,
75
- "step": 1865
76
- },
77
- {
78
- "epoch": 5.361930294906166,
79
- "grad_norm": 0.16181084513664246,
80
- "learning_rate": 1.3859639880607426e-05,
81
- "loss": 0.0242,
82
  "step": 2000
83
  },
84
  {
85
- "epoch": 6.0,
86
- "eval_accuracy": 0.7784109711647034,
87
- "eval_loss": 1.251684308052063,
88
- "eval_runtime": 9.0244,
89
- "eval_samples_per_second": 330.548,
90
- "eval_steps_per_second": 20.722,
91
  "step": 2238
92
  },
93
  {
94
- "epoch": 6.702412868632708,
95
- "grad_norm": 2.160583734512329,
96
- "learning_rate": 9.853963614535916e-06,
97
- "loss": 0.0136,
98
  "step": 2500
99
  }
100
  ],
101
  "logging_steps": 500,
102
- "max_steps": 3730,
103
  "num_input_tokens_seen": 0,
104
  "num_train_epochs": 10,
105
  "save_steps": 500,
106
- "total_flos": 1.2690172067220828e+16,
107
- "train_batch_size": 32,
108
  "trial_name": null,
109
  "trial_params": {
110
- "learning_rate": 2.988234494489347e-05,
111
- "per_device_train_batch_size": 32
112
  }
113
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.351206434316354,
5
  "eval_steps": 500,
6
  "global_step": 2500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
+ "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
+ "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
+ "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
+ "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
+ "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  }
73
  ],
74
  "logging_steps": 500,
75
+ "max_steps": 7460,
76
  "num_input_tokens_seen": 0,
77
  "num_train_epochs": 10,
78
  "save_steps": 500,
79
+ "total_flos": 5940423509036040.0,
80
+ "train_batch_size": 16,
81
  "trial_name": null,
82
  "trial_params": {
83
+ "learning_rate": 2.7246207227140256e-05,
84
+ "per_device_train_batch_size": 16
85
  }
86
  }
run-3/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be3fbf39b8eb79e3df9922f6763aa922164fc565b8a550581ae148e0984a5a12
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29808a477e6dfe819802c452b4eb8059a040db301b42679e25eb9cd4177ce709
3
  size 1340618660
run-3/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9539a7d53917a01fe4b419de93225226e85c3f4a9a0df5577320ae94ffe12d58
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8fc0469f5aa6f117cbacf1d4a26783711bd3e55943b29bbd4da393ae27bfe75
3
  size 2681472237
run-3/checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9bbf563591c5fe3fe29dd3bd70b2dd79355243b58fe1d041144c9bfbba0d18a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39308b250b2dde21cca6217a709d5456bcdcab3c796c6926f25c06b9c730de1
3
  size 14244
run-3/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dde29e8f26af104e75c5a360f6999fec6e726777ec360c3cbb8f5a9d140b76f1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a9c56c5416529238c5208f6b778cd50fb2a5e3ae44a1cc288f1cb37408eb7ee
3
  size 1064
run-3/checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.3404825737265416,
5
  "eval_steps": 500,
6
  "global_step": 500,
7
  "is_hyper_param_search": true,
@@ -9,32 +9,23 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.7727120518684387,
14
- "eval_loss": 0.46057018637657166,
15
- "eval_runtime": 8.9113,
16
- "eval_samples_per_second": 334.743,
17
- "eval_steps_per_second": 20.985,
18
- "step": 373
19
- },
20
- {
21
- "epoch": 1.3404825737265416,
22
- "grad_norm": 8.923430442810059,
23
- "learning_rate": 2.587666867882196e-05,
24
- "loss": 0.4651,
25
  "step": 500
26
  }
27
  ],
28
  "logging_steps": 500,
29
- "max_steps": 3730,
30
  "num_input_tokens_seen": 0,
31
  "num_train_epochs": 10,
32
  "save_steps": 500,
33
- "total_flos": 2540028594209472.0,
34
- "train_batch_size": 32,
35
  "trial_name": null,
36
  "trial_params": {
37
- "learning_rate": 2.988234494489347e-05,
38
- "per_device_train_batch_size": 32
39
  }
40
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6702412868632708,
5
  "eval_steps": 500,
6
  "global_step": 500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  }
18
  ],
19
  "logging_steps": 500,
20
+ "max_steps": 7460,
21
  "num_input_tokens_seen": 0,
22
  "num_train_epochs": 10,
23
  "save_steps": 500,
24
+ "total_flos": 1185529179906432.0,
25
+ "train_batch_size": 16,
26
  "trial_name": null,
27
  "trial_params": {
28
+ "learning_rate": 2.7246207227140256e-05,
29
+ "per_device_train_batch_size": 16
30
  }
31
  }
run-3/checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ed49fe002187d5ffed6d819c5938a827ebb9152529606096a657c65a1141ceb
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048