simonmok committed on
Commit
c9014bf
1 Parent(s): 1e9e11a

Training in progress, step 2500

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20677a90a3902cf3ed56f86f8b03f6bea9f7c430589a4a8bb04004321c23daab
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:483d35db5a9e0274d13ceb4de46f67de2ae9de87d1c21da4b62e40ba3bbc1f19
3
  size 268290900
run-0/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:810e8876309e6d15ec5e71c08ce62cf51942173ed66e3332d73d89182138b579
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d640f9bf98101fcc034503bc61321b86543de3a0036482b49086c4f7da396b
3
  size 268290900
run-0/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:722f488eaad699d0c12b887a8c7947a6a9c826ce769623df0f7c576f69ccd0e1
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610924d4ae41c92e1ad4bd4f7c4e02c16664d9aebde0a8bdd446077ed8454b61
3
  size 536643898
run-0/checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5d96ed9889a2a02e4c7dbb49c09c58f3c154dfd2cb3d0452d915d37d9ed5e34
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:235bcd30eb5caaf6d85f48a7cbef42afd59119224ef62ab684da9f5c869126f8
3
  size 5368
run-0/checkpoint-2500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a246e5230d38af6cf9f93cee2665907d5e74a83c1a33d7b5c442ec611b802e59
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:483d35db5a9e0274d13ceb4de46f67de2ae9de87d1c21da4b62e40ba3bbc1f19
3
  size 268290900
run-0/checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d979e8c2f1b7dba595846d46141e59fc1aa4942a71fbda98924d26079cf7b75
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3014391ab26e9b29773098600d9ab8455dbbaa9458a51dee7201d109b305abd4
3
  size 536643898
run-0/checkpoint-2500/trainer_state.json CHANGED
@@ -10,100 +10,100 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.5903225806451613,
14
- "eval_loss": 0.19605161249637604,
15
- "eval_runtime": 5.6751,
16
- "eval_samples_per_second": 546.248,
17
- "eval_steps_per_second": 11.454,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
- "grad_norm": 0.4525008201599121,
23
  "learning_rate": 1.685534591194969e-05,
24
- "loss": 0.3162,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.827741935483871,
30
- "eval_loss": 0.09383596479892731,
31
- "eval_runtime": 5.6161,
32
- "eval_samples_per_second": 551.986,
33
- "eval_steps_per_second": 11.574,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
- "eval_accuracy": 0.8870967741935484,
39
- "eval_loss": 0.06216968968510628,
40
- "eval_runtime": 5.3402,
41
- "eval_samples_per_second": 580.504,
42
- "eval_steps_per_second": 12.172,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
- "grad_norm": 0.5138508081436157,
48
  "learning_rate": 1.371069182389937e-05,
49
- "loss": 0.1093,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.8974193548387097,
55
- "eval_loss": 0.04743651673197746,
56
- "eval_runtime": 5.4223,
57
- "eval_samples_per_second": 571.718,
58
- "eval_steps_per_second": 11.988,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
- "grad_norm": 0.3047187328338623,
64
  "learning_rate": 1.0566037735849058e-05,
65
- "loss": 0.0688,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
- "eval_accuracy": 0.9132258064516129,
71
- "eval_loss": 0.03883032500743866,
72
- "eval_runtime": 5.5799,
73
- "eval_samples_per_second": 555.57,
74
- "eval_steps_per_second": 11.649,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
- "eval_accuracy": 0.917741935483871,
80
- "eval_loss": 0.03374071046710014,
81
- "eval_runtime": 5.3892,
82
- "eval_samples_per_second": 575.22,
83
- "eval_steps_per_second": 12.061,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
- "grad_norm": 0.22965875267982483,
89
  "learning_rate": 7.421383647798742e-06,
90
- "loss": 0.0537,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
- "eval_accuracy": 0.9209677419354839,
96
- "eval_loss": 0.03097311407327652,
97
- "eval_runtime": 5.5045,
98
- "eval_samples_per_second": 563.176,
99
- "eval_steps_per_second": 11.809,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
- "grad_norm": 0.2030441015958786,
105
  "learning_rate": 4.276729559748428e-06,
106
- "loss": 0.0464,
107
  "step": 2500
108
  }
109
  ],
@@ -124,12 +124,12 @@
124
  "attributes": {}
125
  }
126
  },
127
- "total_flos": 651155886807636.0,
128
  "train_batch_size": 48,
129
  "trial_name": null,
130
  "trial_params": {
131
- "alpha": 0.435694601680663,
132
  "num_train_epochs": 10,
133
- "temperature": 12
134
  }
135
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.6106451612903225,
14
+ "eval_loss": 0.2180573046207428,
15
+ "eval_runtime": 5.4236,
16
+ "eval_samples_per_second": 571.576,
17
+ "eval_steps_per_second": 11.985,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
+ "grad_norm": 0.5000836253166199,
23
  "learning_rate": 1.685534591194969e-05,
24
+ "loss": 0.3508,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.8367741935483871,
30
+ "eval_loss": 0.10006564110517502,
31
+ "eval_runtime": 5.9594,
32
+ "eval_samples_per_second": 520.188,
33
+ "eval_steps_per_second": 10.907,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
+ "eval_accuracy": 0.8880645161290323,
39
+ "eval_loss": 0.06387896835803986,
40
+ "eval_runtime": 5.4554,
41
+ "eval_samples_per_second": 568.241,
42
+ "eval_steps_per_second": 11.915,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
+ "grad_norm": 0.5663716197013855,
48
  "learning_rate": 1.371069182389937e-05,
49
+ "loss": 0.1169,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.9,
55
+ "eval_loss": 0.04769841209053993,
56
+ "eval_runtime": 5.432,
57
+ "eval_samples_per_second": 570.695,
58
+ "eval_steps_per_second": 11.966,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
+ "grad_norm": 0.3216884136199951,
64
  "learning_rate": 1.0566037735849058e-05,
65
+ "loss": 0.0714,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
+ "eval_accuracy": 0.9170967741935484,
71
+ "eval_loss": 0.0384916327893734,
72
+ "eval_runtime": 5.6786,
73
+ "eval_samples_per_second": 545.913,
74
+ "eval_steps_per_second": 11.447,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
+ "eval_accuracy": 0.9183870967741935,
80
+ "eval_loss": 0.0333557203412056,
81
+ "eval_runtime": 5.4625,
82
+ "eval_samples_per_second": 567.504,
83
+ "eval_steps_per_second": 11.899,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
+ "grad_norm": 0.24820531904697418,
89
  "learning_rate": 7.421383647798742e-06,
90
+ "loss": 0.055,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
+ "eval_accuracy": 0.9245161290322581,
96
+ "eval_loss": 0.030584245920181274,
97
+ "eval_runtime": 5.5639,
98
+ "eval_samples_per_second": 557.166,
99
+ "eval_steps_per_second": 11.683,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
+ "grad_norm": 0.21355891227722168,
105
  "learning_rate": 4.276729559748428e-06,
106
+ "loss": 0.0474,
107
  "step": 2500
108
  }
109
  ],
 
124
  "attributes": {}
125
  }
126
  },
127
+ "total_flos": 756659102663436.0,
128
  "train_batch_size": 48,
129
  "trial_name": null,
130
  "trial_params": {
131
+ "alpha": 0.14463960620576077,
132
  "num_train_epochs": 10,
133
+ "temperature": 6
134
  }
135
  }
run-0/checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5d96ed9889a2a02e4c7dbb49c09c58f3c154dfd2cb3d0452d915d37d9ed5e34
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:235bcd30eb5caaf6d85f48a7cbef42afd59119224ef62ab684da9f5c869126f8
3
  size 5368
run-0/checkpoint-3000/trainer_state.json CHANGED
@@ -10,125 +10,125 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.5903225806451613,
14
- "eval_loss": 0.19605161249637604,
15
- "eval_runtime": 5.6751,
16
- "eval_samples_per_second": 546.248,
17
- "eval_steps_per_second": 11.454,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
- "grad_norm": 0.4525008201599121,
23
  "learning_rate": 1.685534591194969e-05,
24
- "loss": 0.3162,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.827741935483871,
30
- "eval_loss": 0.09383596479892731,
31
- "eval_runtime": 5.6161,
32
- "eval_samples_per_second": 551.986,
33
- "eval_steps_per_second": 11.574,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
- "eval_accuracy": 0.8870967741935484,
39
- "eval_loss": 0.06216968968510628,
40
- "eval_runtime": 5.3402,
41
- "eval_samples_per_second": 580.504,
42
- "eval_steps_per_second": 12.172,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
- "grad_norm": 0.5138508081436157,
48
  "learning_rate": 1.371069182389937e-05,
49
- "loss": 0.1093,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.8974193548387097,
55
- "eval_loss": 0.04743651673197746,
56
- "eval_runtime": 5.4223,
57
- "eval_samples_per_second": 571.718,
58
- "eval_steps_per_second": 11.988,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
- "grad_norm": 0.3047187328338623,
64
  "learning_rate": 1.0566037735849058e-05,
65
- "loss": 0.0688,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
- "eval_accuracy": 0.9132258064516129,
71
- "eval_loss": 0.03883032500743866,
72
- "eval_runtime": 5.5799,
73
- "eval_samples_per_second": 555.57,
74
- "eval_steps_per_second": 11.649,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
- "eval_accuracy": 0.917741935483871,
80
- "eval_loss": 0.03374071046710014,
81
- "eval_runtime": 5.3892,
82
- "eval_samples_per_second": 575.22,
83
- "eval_steps_per_second": 12.061,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
- "grad_norm": 0.22965875267982483,
89
  "learning_rate": 7.421383647798742e-06,
90
- "loss": 0.0537,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
- "eval_accuracy": 0.9209677419354839,
96
- "eval_loss": 0.03097311407327652,
97
- "eval_runtime": 5.5045,
98
- "eval_samples_per_second": 563.176,
99
- "eval_steps_per_second": 11.809,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
- "grad_norm": 0.2030441015958786,
105
  "learning_rate": 4.276729559748428e-06,
106
- "loss": 0.0464,
107
  "step": 2500
108
  },
109
  {
110
  "epoch": 8.0,
111
- "eval_accuracy": 0.9238709677419354,
112
- "eval_loss": 0.028694752603769302,
113
- "eval_runtime": 5.6886,
114
- "eval_samples_per_second": 544.953,
115
- "eval_steps_per_second": 11.426,
116
  "step": 2544
117
  },
118
  {
119
  "epoch": 9.0,
120
- "eval_accuracy": 0.927741935483871,
121
- "eval_loss": 0.02778397873044014,
122
- "eval_runtime": 5.3317,
123
- "eval_samples_per_second": 581.426,
124
- "eval_steps_per_second": 12.191,
125
  "step": 2862
126
  },
127
  {
128
  "epoch": 9.433962264150944,
129
- "grad_norm": 0.21865826845169067,
130
  "learning_rate": 1.1320754716981133e-06,
131
- "loss": 0.043,
132
  "step": 3000
133
  }
134
  ],
@@ -149,12 +149,12 @@
149
  "attributes": {}
150
  }
151
  },
152
- "total_flos": 780738843279612.0,
153
  "train_batch_size": 48,
154
  "trial_name": null,
155
  "trial_params": {
156
- "alpha": 0.435694601680663,
157
  "num_train_epochs": 10,
158
- "temperature": 12
159
  }
160
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.6106451612903225,
14
+ "eval_loss": 0.2180573046207428,
15
+ "eval_runtime": 5.4236,
16
+ "eval_samples_per_second": 571.576,
17
+ "eval_steps_per_second": 11.985,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
+ "grad_norm": 0.5000836253166199,
23
  "learning_rate": 1.685534591194969e-05,
24
+ "loss": 0.3508,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.8367741935483871,
30
+ "eval_loss": 0.10006564110517502,
31
+ "eval_runtime": 5.9594,
32
+ "eval_samples_per_second": 520.188,
33
+ "eval_steps_per_second": 10.907,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
+ "eval_accuracy": 0.8880645161290323,
39
+ "eval_loss": 0.06387896835803986,
40
+ "eval_runtime": 5.4554,
41
+ "eval_samples_per_second": 568.241,
42
+ "eval_steps_per_second": 11.915,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
+ "grad_norm": 0.5663716197013855,
48
  "learning_rate": 1.371069182389937e-05,
49
+ "loss": 0.1169,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.9,
55
+ "eval_loss": 0.04769841209053993,
56
+ "eval_runtime": 5.432,
57
+ "eval_samples_per_second": 570.695,
58
+ "eval_steps_per_second": 11.966,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
+ "grad_norm": 0.3216884136199951,
64
  "learning_rate": 1.0566037735849058e-05,
65
+ "loss": 0.0714,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
+ "eval_accuracy": 0.9170967741935484,
71
+ "eval_loss": 0.0384916327893734,
72
+ "eval_runtime": 5.6786,
73
+ "eval_samples_per_second": 545.913,
74
+ "eval_steps_per_second": 11.447,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
+ "eval_accuracy": 0.9183870967741935,
80
+ "eval_loss": 0.0333557203412056,
81
+ "eval_runtime": 5.4625,
82
+ "eval_samples_per_second": 567.504,
83
+ "eval_steps_per_second": 11.899,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
+ "grad_norm": 0.24820531904697418,
89
  "learning_rate": 7.421383647798742e-06,
90
+ "loss": 0.055,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
+ "eval_accuracy": 0.9245161290322581,
96
+ "eval_loss": 0.030584245920181274,
97
+ "eval_runtime": 5.5639,
98
+ "eval_samples_per_second": 557.166,
99
+ "eval_steps_per_second": 11.683,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
+ "grad_norm": 0.21355891227722168,
105
  "learning_rate": 4.276729559748428e-06,
106
+ "loss": 0.0474,
107
  "step": 2500
108
  },
109
  {
110
  "epoch": 8.0,
111
+ "eval_accuracy": 0.9251612903225807,
112
+ "eval_loss": 0.028302712365984917,
113
+ "eval_runtime": 5.6339,
114
+ "eval_samples_per_second": 550.241,
115
+ "eval_steps_per_second": 11.537,
116
  "step": 2544
117
  },
118
  {
119
  "epoch": 9.0,
120
+ "eval_accuracy": 0.9270967741935484,
121
+ "eval_loss": 0.027429578825831413,
122
+ "eval_runtime": 5.6922,
123
+ "eval_samples_per_second": 544.6,
124
+ "eval_steps_per_second": 11.419,
125
  "step": 2862
126
  },
127
  {
128
  "epoch": 9.433962264150944,
129
+ "grad_norm": 0.22619187831878662,
130
  "learning_rate": 1.1320754716981133e-06,
131
+ "loss": 0.0438,
132
  "step": 3000
133
  }
134
  ],
 
149
  "attributes": {}
150
  }
151
  },
152
+ "total_flos": 886242059135412.0,
153
  "train_batch_size": 48,
154
  "trial_name": null,
155
  "trial_params": {
156
+ "alpha": 0.14463960620576077,
157
  "num_train_epochs": 10,
158
+ "temperature": 6
159
  }
160
  }
run-0/checkpoint-3180/trainer_state.json CHANGED
@@ -10,125 +10,125 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.5903225806451613,
14
- "eval_loss": 0.19605161249637604,
15
- "eval_runtime": 5.6751,
16
- "eval_samples_per_second": 546.248,
17
- "eval_steps_per_second": 11.454,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
- "grad_norm": 0.4525008201599121,
23
  "learning_rate": 1.685534591194969e-05,
24
- "loss": 0.3162,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.827741935483871,
30
- "eval_loss": 0.09383596479892731,
31
- "eval_runtime": 5.6161,
32
- "eval_samples_per_second": 551.986,
33
- "eval_steps_per_second": 11.574,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
- "eval_accuracy": 0.8870967741935484,
39
- "eval_loss": 0.06216968968510628,
40
- "eval_runtime": 5.3402,
41
- "eval_samples_per_second": 580.504,
42
- "eval_steps_per_second": 12.172,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
- "grad_norm": 0.5138508081436157,
48
  "learning_rate": 1.371069182389937e-05,
49
- "loss": 0.1093,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.8974193548387097,
55
- "eval_loss": 0.04743651673197746,
56
- "eval_runtime": 5.4223,
57
- "eval_samples_per_second": 571.718,
58
- "eval_steps_per_second": 11.988,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
- "grad_norm": 0.3047187328338623,
64
  "learning_rate": 1.0566037735849058e-05,
65
- "loss": 0.0688,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
- "eval_accuracy": 0.9132258064516129,
71
- "eval_loss": 0.03883032500743866,
72
- "eval_runtime": 5.5799,
73
- "eval_samples_per_second": 555.57,
74
- "eval_steps_per_second": 11.649,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
- "eval_accuracy": 0.917741935483871,
80
- "eval_loss": 0.03374071046710014,
81
- "eval_runtime": 5.3892,
82
- "eval_samples_per_second": 575.22,
83
- "eval_steps_per_second": 12.061,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
- "grad_norm": 0.22965875267982483,
89
  "learning_rate": 7.421383647798742e-06,
90
- "loss": 0.0537,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
- "eval_accuracy": 0.9209677419354839,
96
- "eval_loss": 0.03097311407327652,
97
- "eval_runtime": 5.5045,
98
- "eval_samples_per_second": 563.176,
99
- "eval_steps_per_second": 11.809,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
- "grad_norm": 0.2030441015958786,
105
  "learning_rate": 4.276729559748428e-06,
106
- "loss": 0.0464,
107
  "step": 2500
108
  },
109
  {
110
  "epoch": 8.0,
111
- "eval_accuracy": 0.9238709677419354,
112
- "eval_loss": 0.028694752603769302,
113
- "eval_runtime": 5.6886,
114
- "eval_samples_per_second": 544.953,
115
- "eval_steps_per_second": 11.426,
116
  "step": 2544
117
  },
118
  {
119
  "epoch": 9.0,
120
- "eval_accuracy": 0.927741935483871,
121
- "eval_loss": 0.02778397873044014,
122
- "eval_runtime": 5.3317,
123
- "eval_samples_per_second": 581.426,
124
- "eval_steps_per_second": 12.191,
125
  "step": 2862
126
  },
127
  {
128
  "epoch": 9.433962264150944,
129
- "grad_norm": 0.21865826845169067,
130
  "learning_rate": 1.1320754716981133e-06,
131
- "loss": 0.043,
132
  "step": 3000
133
  }
134
  ],
@@ -149,12 +149,12 @@
149
  "attributes": {}
150
  }
151
  },
152
- "total_flos": 780738843279612.0,
153
  "train_batch_size": 48,
154
  "trial_name": null,
155
  "trial_params": {
156
- "alpha": 0.435694601680663,
157
  "num_train_epochs": 10,
158
- "temperature": 12
159
  }
160
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.6106451612903225,
14
+ "eval_loss": 0.2180573046207428,
15
+ "eval_runtime": 5.4236,
16
+ "eval_samples_per_second": 571.576,
17
+ "eval_steps_per_second": 11.985,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
+ "grad_norm": 0.5000836253166199,
23
  "learning_rate": 1.685534591194969e-05,
24
+ "loss": 0.3508,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.8367741935483871,
30
+ "eval_loss": 0.10006564110517502,
31
+ "eval_runtime": 5.9594,
32
+ "eval_samples_per_second": 520.188,
33
+ "eval_steps_per_second": 10.907,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
+ "eval_accuracy": 0.8880645161290323,
39
+ "eval_loss": 0.06387896835803986,
40
+ "eval_runtime": 5.4554,
41
+ "eval_samples_per_second": 568.241,
42
+ "eval_steps_per_second": 11.915,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
+ "grad_norm": 0.5663716197013855,
48
  "learning_rate": 1.371069182389937e-05,
49
+ "loss": 0.1169,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.9,
55
+ "eval_loss": 0.04769841209053993,
56
+ "eval_runtime": 5.432,
57
+ "eval_samples_per_second": 570.695,
58
+ "eval_steps_per_second": 11.966,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
+ "grad_norm": 0.3216884136199951,
64
  "learning_rate": 1.0566037735849058e-05,
65
+ "loss": 0.0714,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
+ "eval_accuracy": 0.9170967741935484,
71
+ "eval_loss": 0.0384916327893734,
72
+ "eval_runtime": 5.6786,
73
+ "eval_samples_per_second": 545.913,
74
+ "eval_steps_per_second": 11.447,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
+ "eval_accuracy": 0.9183870967741935,
80
+ "eval_loss": 0.0333557203412056,
81
+ "eval_runtime": 5.4625,
82
+ "eval_samples_per_second": 567.504,
83
+ "eval_steps_per_second": 11.899,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
+ "grad_norm": 0.24820531904697418,
89
  "learning_rate": 7.421383647798742e-06,
90
+ "loss": 0.055,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
+ "eval_accuracy": 0.9245161290322581,
96
+ "eval_loss": 0.030584245920181274,
97
+ "eval_runtime": 5.5639,
98
+ "eval_samples_per_second": 557.166,
99
+ "eval_steps_per_second": 11.683,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
+ "grad_norm": 0.21355891227722168,
105
  "learning_rate": 4.276729559748428e-06,
106
+ "loss": 0.0474,
107
  "step": 2500
108
  },
109
  {
110
  "epoch": 8.0,
111
+ "eval_accuracy": 0.9251612903225807,
112
+ "eval_loss": 0.028302712365984917,
113
+ "eval_runtime": 5.6339,
114
+ "eval_samples_per_second": 550.241,
115
+ "eval_steps_per_second": 11.537,
116
  "step": 2544
117
  },
118
  {
119
  "epoch": 9.0,
120
+ "eval_accuracy": 0.9270967741935484,
121
+ "eval_loss": 0.027429578825831413,
122
+ "eval_runtime": 5.6922,
123
+ "eval_samples_per_second": 544.6,
124
+ "eval_steps_per_second": 11.419,
125
  "step": 2862
126
  },
127
  {
128
  "epoch": 9.433962264150944,
129
+ "grad_norm": 0.22619187831878662,
130
  "learning_rate": 1.1320754716981133e-06,
131
+ "loss": 0.0438,
132
  "step": 3000
133
  }
134
  ],
 
149
  "attributes": {}
150
  }
151
  },
152
+ "total_flos": 886242059135412.0,
153
  "train_batch_size": 48,
154
  "trial_name": null,
155
  "trial_params": {
156
+ "alpha": 0.14463960620576077,
157
  "num_train_epochs": 10,
158
+ "temperature": 6
159
  }
160
  }
runs/Nov25_08-56-26_a78cb449300a/events.out.tfevents.1732527563.a78cb449300a.701.5 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a64f52802c4a29ad4d65081f04203af0eb1930054d2699f505e96c9636e8de97
3
- size 30063
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06181206223d415c8f4cacccfdbc0686a5356fd2f9d84b91484ded39cd6be4fa
3
+ size 31454