inflaton committed on
Commit
842e814
1 Parent(s): 0f2cb6d

Training in progress, step 500

model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:842fa676b513d687d5c7365813933e30a841909c1a9d95ed3881ee9900e0bd88
+oid sha256:38879bf812f2ea865a112a9cd8fdd63fcd1d9e3b5480262ad99fdeb8669aebbb
 size 1340618660
run-0/checkpoint-1000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.680965147453083,
+  "epoch": 1.3404825737265416,
   "eval_steps": 500,
   "global_step": 1000,
   "is_hyper_param_search": true,
@@ -9,48 +9,39 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.7576265335083008,
-      "eval_loss": 0.4990231990814209,
-      "eval_runtime": 8.8755,
-      "eval_samples_per_second": 336.094,
-      "eval_steps_per_second": 21.069,
-      "step": 373
-    },
-    {
-      "epoch": 1.3404825737265416,
-      "grad_norm": 10.78962230682373,
-      "learning_rate": 1.458365518757964e-05,
-      "loss": 0.5168,
+      "epoch": 0.6702412868632708,
+      "grad_norm": 6.84100866317749,
+      "learning_rate": 2.923321801244596e-06,
+      "loss": 0.5942,
       "step": 500
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.7727120518684387,
-      "eval_loss": 0.4883217215538025,
-      "eval_runtime": 9.0285,
-      "eval_samples_per_second": 330.398,
-      "eval_steps_per_second": 20.712,
+      "epoch": 1.0,
+      "eval_accuracy": 0.7338250279426575,
+      "eval_loss": 0.5040128231048584,
+      "eval_runtime": 8.8539,
+      "eval_samples_per_second": 336.913,
+      "eval_steps_per_second": 21.121,
       "step": 746
     },
     {
-      "epoch": 2.680965147453083,
-      "grad_norm": 10.365610122680664,
-      "learning_rate": 1.2326123424796413e-05,
-      "loss": 0.2874,
+      "epoch": 1.3404825737265416,
+      "grad_norm": 10.305294036865234,
+      "learning_rate": 2.7133130511551857e-06,
+      "loss": 0.5078,
       "step": 1000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3730,
+  "max_steps": 7460,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 5069168917756236.0,
-  "train_batch_size": 32,
+  "total_flos": 2380566939876720.0,
+  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.6841186950362865e-05,
-    "per_device_train_batch_size": 32
+    "learning_rate": 3.133330551334007e-06,
+    "per_device_train_batch_size": 16
   }
 }
run-0/checkpoint-1500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 4.021447721179625,
+  "epoch": 2.0107238605898123,
   "eval_steps": 500,
   "global_step": 1500,
   "is_hyper_param_search": true,
@@ -9,73 +9,55 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.7576265335083008,
-      "eval_loss": 0.4990231990814209,
-      "eval_runtime": 8.8755,
-      "eval_samples_per_second": 336.094,
-      "eval_steps_per_second": 21.069,
-      "step": 373
-    },
-    {
-      "epoch": 1.3404825737265416,
-      "grad_norm": 10.78962230682373,
-      "learning_rate": 1.458365518757964e-05,
-      "loss": 0.5168,
+      "epoch": 0.6702412868632708,
+      "grad_norm": 6.84100866317749,
+      "learning_rate": 2.923321801244596e-06,
+      "loss": 0.5942,
       "step": 500
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.7727120518684387,
-      "eval_loss": 0.4883217215538025,
-      "eval_runtime": 9.0285,
-      "eval_samples_per_second": 330.398,
-      "eval_steps_per_second": 20.712,
+      "epoch": 1.0,
+      "eval_accuracy": 0.7338250279426575,
+      "eval_loss": 0.5040128231048584,
+      "eval_runtime": 8.8539,
+      "eval_samples_per_second": 336.913,
+      "eval_steps_per_second": 21.121,
       "step": 746
     },
     {
-      "epoch": 2.680965147453083,
-      "grad_norm": 10.365610122680664,
-      "learning_rate": 1.2326123424796413e-05,
-      "loss": 0.2874,
+      "epoch": 1.3404825737265416,
+      "grad_norm": 10.305294036865234,
+      "learning_rate": 2.7133130511551857e-06,
+      "loss": 0.5078,
       "step": 1000
     },
     {
-      "epoch": 3.0,
-      "eval_accuracy": 0.7720415592193604,
-      "eval_loss": 0.7083144187927246,
-      "eval_runtime": 9.0355,
-      "eval_samples_per_second": 330.141,
-      "eval_steps_per_second": 20.696,
-      "step": 1119
-    },
-    {
-      "epoch": 4.0,
-      "eval_accuracy": 0.7646664381027222,
-      "eval_loss": 0.9298484921455383,
-      "eval_runtime": 9.116,
-      "eval_samples_per_second": 327.226,
-      "eval_steps_per_second": 20.513,
+      "epoch": 2.0,
+      "eval_accuracy": 0.7418705821037292,
+      "eval_loss": 0.4992324709892273,
+      "eval_runtime": 8.9157,
+      "eval_samples_per_second": 334.577,
+      "eval_steps_per_second": 20.974,
       "step": 1492
     },
     {
-      "epoch": 4.021447721179625,
-      "grad_norm": 2.833449363708496,
-      "learning_rate": 1.0068591662013187e-05,
-      "loss": 0.1128,
+      "epoch": 2.0107238605898123,
+      "grad_norm": 5.501172065734863,
+      "learning_rate": 2.5033043010657747e-06,
+      "loss": 0.4546,
       "step": 1500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3730,
+  "max_steps": 7460,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 7610944877367948.0,
-  "train_batch_size": 32,
+  "total_flos": 3566922478004628.0,
+  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.6841186950362865e-05,
-    "per_device_train_batch_size": 32
+    "learning_rate": 3.133330551334007e-06,
+    "per_device_train_batch_size": 16
   }
 }
run-0/checkpoint-2000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 5.361930294906166,
+  "epoch": 2.680965147453083,
   "eval_steps": 500,
   "global_step": 2000,
   "is_hyper_param_search": true,
@@ -9,89 +9,62 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.7576265335083008,
-      "eval_loss": 0.4990231990814209,
-      "eval_runtime": 8.8755,
-      "eval_samples_per_second": 336.094,
-      "eval_steps_per_second": 21.069,
-      "step": 373
-    },
-    {
-      "epoch": 1.3404825737265416,
-      "grad_norm": 10.78962230682373,
-      "learning_rate": 1.458365518757964e-05,
-      "loss": 0.5168,
+      "epoch": 0.6702412868632708,
+      "grad_norm": 6.84100866317749,
+      "learning_rate": 2.923321801244596e-06,
+      "loss": 0.5942,
       "step": 500
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.7727120518684387,
-      "eval_loss": 0.4883217215538025,
-      "eval_runtime": 9.0285,
-      "eval_samples_per_second": 330.398,
-      "eval_steps_per_second": 20.712,
+      "epoch": 1.0,
+      "eval_accuracy": 0.7338250279426575,
+      "eval_loss": 0.5040128231048584,
+      "eval_runtime": 8.8539,
+      "eval_samples_per_second": 336.913,
+      "eval_steps_per_second": 21.121,
       "step": 746
     },
     {
-      "epoch": 2.680965147453083,
-      "grad_norm": 10.365610122680664,
-      "learning_rate": 1.2326123424796413e-05,
-      "loss": 0.2874,
+      "epoch": 1.3404825737265416,
+      "grad_norm": 10.305294036865234,
+      "learning_rate": 2.7133130511551857e-06,
+      "loss": 0.5078,
       "step": 1000
     },
     {
-      "epoch": 3.0,
-      "eval_accuracy": 0.7720415592193604,
-      "eval_loss": 0.7083144187927246,
-      "eval_runtime": 9.0355,
-      "eval_samples_per_second": 330.141,
-      "eval_steps_per_second": 20.696,
-      "step": 1119
-    },
-    {
-      "epoch": 4.0,
-      "eval_accuracy": 0.7646664381027222,
-      "eval_loss": 0.9298484921455383,
-      "eval_runtime": 9.116,
-      "eval_samples_per_second": 327.226,
-      "eval_steps_per_second": 20.513,
+      "epoch": 2.0,
+      "eval_accuracy": 0.7418705821037292,
+      "eval_loss": 0.4992324709892273,
+      "eval_runtime": 8.9157,
+      "eval_samples_per_second": 334.577,
+      "eval_steps_per_second": 20.974,
       "step": 1492
     },
     {
-      "epoch": 4.021447721179625,
-      "grad_norm": 2.833449363708496,
-      "learning_rate": 1.0068591662013187e-05,
-      "loss": 0.1128,
+      "epoch": 2.0107238605898123,
+      "grad_norm": 5.501172065734863,
+      "learning_rate": 2.5033043010657747e-06,
+      "loss": 0.4546,
       "step": 1500
     },
     {
-      "epoch": 5.0,
-      "eval_accuracy": 0.7696949243545532,
-      "eval_loss": 1.2057135105133057,
-      "eval_runtime": 9.0846,
-      "eval_samples_per_second": 328.357,
-      "eval_steps_per_second": 20.584,
-      "step": 1865
-    },
-    {
-      "epoch": 5.361930294906166,
-      "grad_norm": 0.6012887954711914,
-      "learning_rate": 7.811059899229962e-06,
-      "loss": 0.044,
+      "epoch": 2.680965147453083,
+      "grad_norm": 7.48345947265625,
+      "learning_rate": 2.293295550976364e-06,
+      "loss": 0.3872,
       "step": 2000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3730,
+  "max_steps": 7460,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 1.01458223839854e+16,
-  "train_batch_size": 32,
+  "total_flos": 4761479712489300.0,
+  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.6841186950362865e-05,
-    "per_device_train_batch_size": 32
+    "learning_rate": 3.133330551334007e-06,
+    "per_device_train_batch_size": 16
   }
 }
run-0/checkpoint-2500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 6.702412868632708,
+  "epoch": 3.351206434316354,
   "eval_steps": 500,
   "global_step": 2500,
   "is_hyper_param_search": true,
@@ -9,105 +9,78 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.7576265335083008,
-      "eval_loss": 0.4990231990814209,
-      "eval_runtime": 8.8755,
-      "eval_samples_per_second": 336.094,
-      "eval_steps_per_second": 21.069,
-      "step": 373
-    },
-    {
-      "epoch": 1.3404825737265416,
-      "grad_norm": 10.78962230682373,
-      "learning_rate": 1.458365518757964e-05,
-      "loss": 0.5168,
+      "epoch": 0.6702412868632708,
+      "grad_norm": 6.84100866317749,
+      "learning_rate": 2.923321801244596e-06,
+      "loss": 0.5942,
       "step": 500
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.7727120518684387,
-      "eval_loss": 0.4883217215538025,
-      "eval_runtime": 9.0285,
-      "eval_samples_per_second": 330.398,
-      "eval_steps_per_second": 20.712,
+      "epoch": 1.0,
+      "eval_accuracy": 0.7338250279426575,
+      "eval_loss": 0.5040128231048584,
+      "eval_runtime": 8.8539,
+      "eval_samples_per_second": 336.913,
+      "eval_steps_per_second": 21.121,
       "step": 746
     },
     {
-      "epoch": 2.680965147453083,
-      "grad_norm": 10.365610122680664,
-      "learning_rate": 1.2326123424796413e-05,
-      "loss": 0.2874,
+      "epoch": 1.3404825737265416,
+      "grad_norm": 10.305294036865234,
+      "learning_rate": 2.7133130511551857e-06,
+      "loss": 0.5078,
       "step": 1000
     },
     {
-      "epoch": 3.0,
-      "eval_accuracy": 0.7720415592193604,
-      "eval_loss": 0.7083144187927246,
-      "eval_runtime": 9.0355,
-      "eval_samples_per_second": 330.141,
-      "eval_steps_per_second": 20.696,
-      "step": 1119
-    },
-    {
-      "epoch": 4.0,
-      "eval_accuracy": 0.7646664381027222,
-      "eval_loss": 0.9298484921455383,
-      "eval_runtime": 9.116,
-      "eval_samples_per_second": 327.226,
-      "eval_steps_per_second": 20.513,
+      "epoch": 2.0,
+      "eval_accuracy": 0.7418705821037292,
+      "eval_loss": 0.4992324709892273,
+      "eval_runtime": 8.9157,
+      "eval_samples_per_second": 334.577,
+      "eval_steps_per_second": 20.974,
       "step": 1492
     },
     {
-      "epoch": 4.021447721179625,
-      "grad_norm": 2.833449363708496,
-      "learning_rate": 1.0068591662013187e-05,
-      "loss": 0.1128,
+      "epoch": 2.0107238605898123,
+      "grad_norm": 5.501172065734863,
+      "learning_rate": 2.5033043010657747e-06,
+      "loss": 0.4546,
       "step": 1500
     },
     {
-      "epoch": 5.0,
-      "eval_accuracy": 0.7696949243545532,
-      "eval_loss": 1.2057135105133057,
-      "eval_runtime": 9.0846,
-      "eval_samples_per_second": 328.357,
-      "eval_steps_per_second": 20.584,
-      "step": 1865
-    },
-    {
-      "epoch": 5.361930294906166,
-      "grad_norm": 0.6012887954711914,
-      "learning_rate": 7.811059899229962e-06,
-      "loss": 0.044,
+      "epoch": 2.680965147453083,
+      "grad_norm": 7.48345947265625,
+      "learning_rate": 2.293295550976364e-06,
+      "loss": 0.3872,
       "step": 2000
     },
     {
-      "epoch": 6.0,
-      "eval_accuracy": 0.7753939032554626,
-      "eval_loss": 1.2873387336730957,
-      "eval_runtime": 9.0691,
-      "eval_samples_per_second": 328.919,
-      "eval_steps_per_second": 20.619,
+      "epoch": 3.0,
+      "eval_accuracy": 0.7529332637786865,
+      "eval_loss": 0.5125850439071655,
+      "eval_runtime": 8.8675,
+      "eval_samples_per_second": 336.395,
+      "eval_steps_per_second": 21.088,
       "step": 2238
     },
     {
-      "epoch": 6.702412868632708,
-      "grad_norm": 0.0961245447397232,
-      "learning_rate": 5.553528136446735e-06,
-      "loss": 0.027,
+      "epoch": 3.351206434316354,
+      "grad_norm": 8.878561973571777,
+      "learning_rate": 2.0832868008869536e-06,
+      "loss": 0.3551,
       "step": 2500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3730,
+  "max_steps": 7460,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 1.2690172067220828e+16,
-  "train_batch_size": 32,
+  "total_flos": 5940423509036040.0,
+  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.6841186950362865e-05,
-    "per_device_train_batch_size": 32
+    "learning_rate": 3.133330551334007e-06,
+    "per_device_train_batch_size": 16
   }
 }
run-0/checkpoint-3000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 8.04289544235925,
+  "epoch": 4.021447721179625,
   "eval_steps": 500,
   "global_step": 3000,
   "is_hyper_param_search": true,
@@ -9,130 +9,94 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.7576265335083008,
-      "eval_loss": 0.4990231990814209,
-      "eval_runtime": 8.8755,
-      "eval_samples_per_second": 336.094,
-      "eval_steps_per_second": 21.069,
-      "step": 373
-    },
-    {
-      "epoch": 1.3404825737265416,
-      "grad_norm": 10.78962230682373,
-      "learning_rate": 1.458365518757964e-05,
-      "loss": 0.5168,
+      "epoch": 0.6702412868632708,
+      "grad_norm": 6.84100866317749,
+      "learning_rate": 2.923321801244596e-06,
+      "loss": 0.5942,
       "step": 500
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.7727120518684387,
-      "eval_loss": 0.4883217215538025,
-      "eval_runtime": 9.0285,
-      "eval_samples_per_second": 330.398,
-      "eval_steps_per_second": 20.712,
+      "epoch": 1.0,
+      "eval_accuracy": 0.7338250279426575,
+      "eval_loss": 0.5040128231048584,
+      "eval_runtime": 8.8539,
+      "eval_samples_per_second": 336.913,
+      "eval_steps_per_second": 21.121,
       "step": 746
     },
     {
-      "epoch": 2.680965147453083,
-      "grad_norm": 10.365610122680664,
-      "learning_rate": 1.2326123424796413e-05,
-      "loss": 0.2874,
+      "epoch": 1.3404825737265416,
+      "grad_norm": 10.305294036865234,
+      "learning_rate": 2.7133130511551857e-06,
+      "loss": 0.5078,
       "step": 1000
     },
     {
-      "epoch": 3.0,
-      "eval_accuracy": 0.7720415592193604,
-      "eval_loss": 0.7083144187927246,
-      "eval_runtime": 9.0355,
-      "eval_samples_per_second": 330.141,
-      "eval_steps_per_second": 20.696,
-      "step": 1119
-    },
-    {
-      "epoch": 4.0,
-      "eval_accuracy": 0.7646664381027222,
-      "eval_loss": 0.9298484921455383,
-      "eval_runtime": 9.116,
-      "eval_samples_per_second": 327.226,
-      "eval_steps_per_second": 20.513,
+      "epoch": 2.0,
+      "eval_accuracy": 0.7418705821037292,
+      "eval_loss": 0.4992324709892273,
+      "eval_runtime": 8.9157,
+      "eval_samples_per_second": 334.577,
+      "eval_steps_per_second": 20.974,
       "step": 1492
     },
     {
-      "epoch": 4.021447721179625,
-      "grad_norm": 2.833449363708496,
-      "learning_rate": 1.0068591662013187e-05,
-      "loss": 0.1128,
+      "epoch": 2.0107238605898123,
+      "grad_norm": 5.501172065734863,
+      "learning_rate": 2.5033043010657747e-06,
+      "loss": 0.4546,
       "step": 1500
     },
     {
-      "epoch": 5.0,
-      "eval_accuracy": 0.7696949243545532,
-      "eval_loss": 1.2057135105133057,
-      "eval_runtime": 9.0846,
-      "eval_samples_per_second": 328.357,
-      "eval_steps_per_second": 20.584,
-      "step": 1865
-    },
-    {
-      "epoch": 5.361930294906166,
-      "grad_norm": 0.6012887954711914,
-      "learning_rate": 7.811059899229962e-06,
-      "loss": 0.044,
+      "epoch": 2.680965147453083,
+      "grad_norm": 7.48345947265625,
+      "learning_rate": 2.293295550976364e-06,
+      "loss": 0.3872,
       "step": 2000
     },
     {
-      "epoch": 6.0,
-      "eval_accuracy": 0.7753939032554626,
-      "eval_loss": 1.2873387336730957,
-      "eval_runtime": 9.0691,
-      "eval_samples_per_second": 328.919,
-      "eval_steps_per_second": 20.619,
+      "epoch": 3.0,
+      "eval_accuracy": 0.7529332637786865,
+      "eval_loss": 0.5125850439071655,
+      "eval_runtime": 8.8675,
+      "eval_samples_per_second": 336.395,
+      "eval_steps_per_second": 21.088,
       "step": 2238
     },
     {
-      "epoch": 6.702412868632708,
-      "grad_norm": 0.0961245447397232,
-      "learning_rate": 5.553528136446735e-06,
-      "loss": 0.027,
+      "epoch": 3.351206434316354,
+      "grad_norm": 8.878561973571777,
+      "learning_rate": 2.0832868008869536e-06,
+      "loss": 0.3551,
       "step": 2500
     },
     {
-      "epoch": 7.0,
-      "eval_accuracy": 0.7723767757415771,
-      "eval_loss": 1.348684549331665,
-      "eval_runtime": 9.0525,
-      "eval_samples_per_second": 329.521,
-      "eval_steps_per_second": 20.657,
-      "step": 2611
-    },
-    {
-      "epoch": 8.0,
-      "eval_accuracy": 0.7730472683906555,
-      "eval_loss": 1.3971577882766724,
-      "eval_runtime": 9.0899,
-      "eval_samples_per_second": 328.168,
-      "eval_steps_per_second": 20.572,
+      "epoch": 4.0,
+      "eval_accuracy": 0.750921905040741,
+      "eval_loss": 0.5793744921684265,
+      "eval_runtime": 8.9125,
+      "eval_samples_per_second": 334.698,
+      "eval_steps_per_second": 20.982,
       "step": 2984
     },
     {
-      "epoch": 8.04289544235925,
-      "grad_norm": 2.5750341415405273,
-      "learning_rate": 3.2959963736635094e-06,
-      "loss": 0.0157,
+      "epoch": 4.021447721179625,
+      "grad_norm": 17.35097312927246,
+      "learning_rate": 1.8732780507975428e-06,
+      "loss": 0.3097,
       "step": 3000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3730,
+  "max_steps": 7460,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 1.5221292738223464e+16,
-  "train_batch_size": 32,
+  "total_flos": 7128413561883960.0,
+  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.6841186950362865e-05,
-    "per_device_train_batch_size": 32
+    "learning_rate": 3.133330551334007e-06,
+    "per_device_train_batch_size": 16
   }
 }
run-0/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:55851977026014f4b784d3a7727f15e1709df53ce1e653365a304fbaa1c8c8a4
+oid sha256:38879bf812f2ea865a112a9cd8fdd63fcd1d9e3b5480262ad99fdeb8669aebbb
 size 1340618660
run-0/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6fddeffafcc1d70abcdc6aa92e96da5b7496c55f91c07a60fb32ac5af7ff783d
+oid sha256:2e07eb37769cd2feb155c6b916975aec48e551c76eda4b6fafd9cf1ca7c3bca7
 size 2681472237
run-0/checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d9bbf563591c5fe3fe29dd3bd70b2dd79355243b58fe1d041144c9bfbba0d18a
+oid sha256:b39308b250b2dde21cca6217a709d5456bcdcab3c796c6926f25c06b9c730de1
 size 14244
run-0/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3dc2b58cb6162128ce96179719f9e4590068a0401c821507351f2c446cb729d0
+oid sha256:ae38da46bf22e39dc164f74878f73734c60df1d2faf2ab7463738d2ec368d0d2
 size 1064
run-0/checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.3404825737265416,
+  "epoch": 0.6702412868632708,
   "eval_steps": 500,
   "global_step": 500,
   "is_hyper_param_search": true,
@@ -9,32 +9,23 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.7576265335083008,
-      "eval_loss": 0.4990231990814209,
-      "eval_runtime": 8.8755,
-      "eval_samples_per_second": 336.094,
-      "eval_steps_per_second": 21.069,
-      "step": 373
-    },
-    {
-      "epoch": 1.3404825737265416,
-      "grad_norm": 10.78962230682373,
-      "learning_rate": 1.458365518757964e-05,
-      "loss": 0.5168,
+      "epoch": 0.6702412868632708,
+      "grad_norm": 6.84100866317749,
+      "learning_rate": 2.923321801244596e-06,
+      "loss": 0.5942,
       "step": 500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3730,
+  "max_steps": 7460,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 2540028594209472.0,
-  "train_batch_size": 32,
+  "total_flos": 1185529179906432.0,
+  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.6841186950362865e-05,
-    "per_device_train_batch_size": 32
+    "learning_rate": 3.133330551334007e-06,
+    "per_device_train_batch_size": 16
   }
 }
run-0/checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15189c3d12b5f903b32dcdb8f142fc9f600c7b03c5e75247ee6c5c612e40a65d
+oid sha256:d85cbcd0df5fe052a5199b6aa6fab09a1050a56fe5d4201688db94d30085dc57
 size 5048
run-8/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:577e69a7a059c3946386d8325e043ba9bc056922d537a8724502c81a66f6062d
+oid sha256:9461766d71cc52d818864796c1ea2b597890ec358346b294c381abf16bdf82cf
 size 1340618660
run-8/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b329fae6655c0858c21fd312f51a6010c98395d3932cdf3cd99cc5ddec7555f0
+oid sha256:35e8bde458201cb88b3dda2faf9d7d4709af674b359a94f682f1f61df90161f1
 size 2681472237
run-8/checkpoint-1000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b8a41427ba0c00138131c0332df47016c0febb7eb26ac8c48be842afab125544
+oid sha256:f15ef18f77678b087c8d8d016723a03f0d0f4e837051c67af83ea5404cfb3e09
 size 14244
run-8/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a462e315269088efc372b3ada630829a976fe7a09ee980f2292b5bc774303f4c
+oid sha256:2213efd67cc97cffa04c26ac0b875fd692518dbd7931cbc0273bda3d0fccf497
 size 1064
run-8/checkpoint-1000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 5.347593582887701,
+  "epoch": 2.680965147453083,
   "eval_steps": 500,
   "global_step": 1000,
   "is_hyper_param_search": true,
@@ -10,74 +10,47 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.7566208243370056,
-      "eval_loss": 0.4764082133769989,
-      "eval_runtime": 8.6013,
-      "eval_samples_per_second": 346.81,
-      "eval_steps_per_second": 21.741,
-      "step": 187
+      "eval_accuracy": 0.7653369307518005,
+      "eval_loss": 0.4685952365398407,
+      "eval_runtime": 9.0759,
+      "eval_samples_per_second": 328.672,
+      "eval_steps_per_second": 20.604,
+      "step": 373
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.7707006335258484,
-      "eval_loss": 0.4789218306541443,
-      "eval_runtime": 8.7284,
-      "eval_samples_per_second": 341.757,
-      "eval_steps_per_second": 21.424,
-      "step": 374
-    },
-    {
-      "epoch": 2.6737967914438503,
-      "grad_norm": 5.854825496673584,
-      "learning_rate": 1.3210882896809641e-05,
-      "loss": 0.3976,
+      "epoch": 1.3404825737265416,
+      "grad_norm": 11.95083236694336,
+      "learning_rate": 1.278490707659754e-05,
+      "loss": 0.4881,
       "step": 500
     },
     {
-      "epoch": 3.0,
-      "eval_accuracy": 0.7670130729675293,
-      "eval_loss": 0.5469409823417664,
-      "eval_runtime": 8.6858,
-      "eval_samples_per_second": 343.435,
-      "eval_steps_per_second": 21.529,
-      "step": 561
-    },
-    {
-      "epoch": 4.0,
-      "eval_accuracy": 0.7727120518684387,
-      "eval_loss": 0.7016632556915283,
-      "eval_runtime": 8.7334,
-      "eval_samples_per_second": 341.563,
-      "eval_steps_per_second": 21.412,
-      "step": 748
-    },
-    {
-      "epoch": 5.0,
-      "eval_accuracy": 0.7750586867332458,
-      "eval_loss": 0.8148671984672546,
-      "eval_runtime": 8.7283,
-      "eval_samples_per_second": 341.762,
-      "eval_steps_per_second": 21.425,
-      "step": 935
+      "epoch": 2.0,
+      "eval_accuracy": 0.7707006335258484,
+      "eval_loss": 0.47391676902770996,
+      "eval_runtime": 9.0389,
+      "eval_samples_per_second": 330.018,
+      "eval_steps_per_second": 20.688,
+      "step": 746
     },
     {
-      "epoch": 5.347593582887701,
-      "grad_norm": 3.247746467590332,
-      "learning_rate": 8.389392788484955e-06,
-      "loss": 0.0984,
+      "epoch": 2.680965147453083,
+      "grad_norm": 15.537776947021484,
+      "learning_rate": 1.0805819293842502e-05,
+      "loss": 0.2385,
       "step": 1000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 1870,
+  "max_steps": 3730,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 1.3339838883429072e+16,
-  "train_batch_size": 64,
+  "total_flos": 8968123585287756.0,
+  "train_batch_size": 32,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.8032373005134327e-05,
-    "per_device_train_batch_size": 64
+    "learning_rate": 1.4763994859352575e-05,
+    "per_device_train_batch_size": 32
   }
 }
run-8/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3cd79611d1e9a2b1e29f7d4a2137e798ae426a72939938919f5f2a5833b63c02
+oid sha256:39e6a33bf8c21d040f79b518acc997a1f817992f979b28f2a817ab72bcaeabb9
 size 5048
run-8/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34803c7ffa3413f8d688edc6ea5c974e8d7f56a690438a4dca385053434edd91
+oid sha256:a84157c5c45e1042099f88bc965875ace1cd7c98953b8ff27ad67b1cf72b8f02
 size 1340618660
run-8/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f97b1b6cfba0d414fcf63e15f168aedc51d7a06b4327a40be9ab2c868bbe3638
+oid sha256:4c751026411b0d6187a0f16f90634f79a35b4955a8297a16a2c68ab30b7da4cc
 size 2681472237
run-8/checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5c01a35b53aad6d494f823e04325bafd8b1267be264b956fed93103db217bd6
+oid sha256:d9bbf563591c5fe3fe29dd3bd70b2dd79355243b58fe1d041144c9bfbba0d18a
 size 14244
run-8/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d53f8274a94cf5eebbf0b779b41dfd01b31be3f4918ec13c3fbc57b670527966
+oid sha256:50a716b0a7dd5d9fc9bf3d42178286c830bd227ae23513d8ebb2eb9d208dd3b1
 size 1064
run-8/checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.6737967914438503,
+  "epoch": 1.3404825737265416,
   "eval_steps": 500,
   "global_step": 500,
   "is_hyper_param_search": true,
@@ -10,40 +10,31 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "eval_accuracy": 0.7566208243370056,
-      "eval_loss": 0.4764082133769989,
-      "eval_runtime": 8.6013,
-      "eval_samples_per_second": 346.81,
-      "eval_steps_per_second": 21.741,
-      "step": 187
+      "eval_accuracy": 0.7653369307518005,
+      "eval_loss": 0.4685952365398407,
+      "eval_runtime": 9.0759,
+      "eval_samples_per_second": 328.672,
+      "eval_steps_per_second": 20.604,
+      "step": 373
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.7707006335258484,
-      "eval_loss": 0.4789218306541443,
-      "eval_runtime": 8.7284,
-      "eval_samples_per_second": 341.757,
-      "eval_steps_per_second": 21.424,
-      "step": 374
-    },
-    {
-      "epoch": 2.6737967914438503,
-      "grad_norm": 5.854825496673584,
-      "learning_rate": 1.3210882896809641e-05,
-      "loss": 0.3976,
+      "epoch": 1.3404825737265416,
+      "grad_norm": 11.95083236694336,
+      "learning_rate": 1.278490707659754e-05,
+      "loss": 0.4881,
       "step": 500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 1870,
+  "max_steps": 3730,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 7927563847909248.0,
-  "train_batch_size": 64,
+  "total_flos": 6438983261740992.0,
+  "train_batch_size": 32,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 1.8032373005134327e-05,
-    "per_device_train_batch_size": 64
+    "learning_rate": 1.4763994859352575e-05,
+    "per_device_train_batch_size": 32
   }
 }
run-8/checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3cd79611d1e9a2b1e29f7d4a2137e798ae426a72939938919f5f2a5833b63c02
+oid sha256:39e6a33bf8c21d040f79b518acc997a1f817992f979b28f2a817ab72bcaeabb9
 size 5048
run-9/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c3baea984c83f240026d0be9a5376c8d95a32b95802239403caae61b8dbc63c5
+oid sha256:9159572245f0035f61d1cd4bfed00891c512f31df58b3a9e65a62eb65c9bfc4c
 size 1340618660
run-9/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b30ca04a6543322e6d924764ec640fb6321ca1d1d08caa6845decb1ec54ac9cf
+oid sha256:12ea73035baf6b4832de530ef21ee74c16b9cde8f8659d675f47a0ac2f1e7a2f
 size 2681472237
run-9/checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:addf9ea78940efee8fedf34a91f3e64300d9a530d1afa572ef52fa1ce5da865e
+oid sha256:f544045d8bc120b0bef3c491fba9f1ed6efda96a8fe519bf19d9f17a0a9934ac
 size 14244
run-9/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:224874eac5a18c3df5b2765ef3f31836618965e5c51583216d4d7041cacd3732
+oid sha256:2ad97d024699994a66c55029ecab90d6ae8a0afacb2f8c1b3035f7581d29a7d6
 size 1064
run-9/checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.3404825737265416,
+  "epoch": 0.6702412868632708,
   "eval_steps": 500,
   "global_step": 500,
   "is_hyper_param_search": true,
@@ -9,32 +9,23 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.7328193187713623,
-      "eval_loss": 0.5106493234634399,
-      "eval_runtime": 8.5744,
-      "eval_samples_per_second": 347.895,
-      "eval_steps_per_second": 21.809,
-      "step": 373
-    },
-    {
-      "epoch": 1.3404825737265416,
-      "grad_norm": 7.487569332122803,
-      "learning_rate": 3.217425242554135e-06,
-      "loss": 0.5698,
+      "epoch": 0.6702412868632708,
+      "grad_norm": 15.942371368408203,
+      "learning_rate": 5.489232390518381e-06,
+      "loss": 0.5696,
       "step": 500
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3730,
+  "max_steps": 7460,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
-  "total_flos": 2532645975385008.0,
-  "train_batch_size": 32,
+  "total_flos": 1796655667961520.0,
+  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": {
-    "learning_rate": 3.71547868567397e-06,
-    "per_device_train_batch_size": 32
+    "learning_rate": 5.883573797883207e-06,
+    "per_device_train_batch_size": 16
  }
 }
run-9/checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fa93be00dadaea06ce272a0a4e2225ef5e788b629b9b78b1acade2295ab896bb
+oid sha256:cf4b65fc1c8f99c657c41e9ca3c93e5969bd485dba5c1c7339b1ef8c60c61e8c
 size 5048
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a843713c73e12f359184fae200bd4db35c7e342723c6e05d613c5eed17ea8f97
+oid sha256:d85cbcd0df5fe052a5199b6aa6fab09a1050a56fe5d4201688db94d30085dc57
 size 5048
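
The trainer_state.json files in this commit all share one layout: top-level progress counters ("epoch", "global_step", "max_steps"), a "log_history" list that mixes training logs (loss, grad_norm, learning_rate) with evaluation logs (eval_accuracy, eval_loss), and the hyperparameters sampled for the trial under "trial_params". Below is a minimal sketch of how the trials touched here could be compared, assuming the repository has been cloned locally so the run-*/checkpoint-*/trainer_state.json paths shown in this diff exist on disk; the script is illustrative only and is not part of the commit.

import json
from pathlib import Path

# Illustrative summary of each hyperparameter-search checkpoint: read
# trainer_state.json, pull the sampled trial_params, and report the best
# eval_accuracy logged so far for that trial.
for state_path in sorted(Path(".").glob("run-*/checkpoint-*/trainer_state.json")):
    state = json.loads(state_path.read_text())
    trial = state.get("trial_params") or {}  # sampled learning rate / batch size
    evals = [e["eval_accuracy"] for e in state["log_history"] if "eval_accuracy" in e]
    best = max(evals) if evals else None
    print(
        f"{state_path}: step={state['global_step']}, "
        f"lr={trial.get('learning_rate')}, "
        f"batch={trial.get('per_device_train_batch_size')}, "
        f"best_eval_accuracy={best}"
    )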