Rubywong123 commited on
Commit
f16d196
·
verified ·
1 Parent(s): e9edfcb

Upload folder using huggingface_hub

Browse files
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.9915254237288136,
3
- "total_flos": 2.0040717157217075e+17,
4
- "train_loss": 0.3583613080091966,
5
- "train_runtime": 1046.9285,
6
  "train_samples": 1886,
7
- "train_samples_per_second": 3.603,
8
- "train_steps_per_second": 0.075
9
  }
 
1
  {
2
  "epoch": 1.9915254237288136,
3
+ "total_flos": 1.99474976032555e+17,
4
+ "train_loss": 0.34869656616296524,
5
+ "train_runtime": 1209.7175,
6
  "train_samples": 1886,
7
+ "train_samples_per_second": 3.118,
8
+ "train_steps_per_second": 0.064
9
  }
checkpoint-78/global_step78/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e706c0f6ba1146d24b0d603079e53bd473f0b2a326a13d8e956e9c2e1112e629
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e19f69cd69f06c047b856af37fa26be7ecc2f1652ffed106e4ed67de201a92e
3
  size 24090788620
checkpoint-78/global_step78/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:706e7871adce3e6888b154ca136c3ceb5c250427de657bfc6c7720e531243275
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74860641fa0d8b9a1875f73da806d2c7699a33ee66d53e11e3b4fd59fb9d3d3f
3
  size 24090788620
checkpoint-78/global_step78/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bbffc12416615fc410d75419c6d3a8ec95587ad55f2b97872d20b914e7f48d6
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1726f0da5f7454310ac5250cf76bc31467df4c7396e597cf9090a298a620af37
3
  size 24090788620
checkpoint-78/global_step78/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd785912a6abdbe045de567e5c1c2adb0c193ff25406eb6a5490792eaa13666b
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b6735808c7da1530a9462e9dbb4fde337a22ce7ebebb500a37a81cf5b63a3b
3
  size 24090788620
checkpoint-78/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82d7684f0bc2d9ba8cae8438d38b9f13df930e561a00db485fb33be89e83a0cc
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9463134d24a68e5467a165936344f7888de6dc726a027ff8087ca17573d59ed8
3
  size 4976698672
checkpoint-78/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcb4cbe4ed92182813741623598059e78fc5b0db59dce56cfa210a0fc186e7ca
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ed7d48cd123c89198e1f24f44b6af5b86f4f420c1b7eb7ade81652ce1e6dab
3
  size 4999802720
checkpoint-78/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88bdabd82844250933c777821ffa80dd245c0c9c8c553092254832b17d6e9cd9
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e803792750c32787a675e62d8189bb5df6a8122a89ce8df44fe3aedef0a169
3
  size 4915916176
checkpoint-78/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:300bcb73b19b2c48b147c91dfc95b728037c5278340cbae67b3793e4168300d9
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8447e4e7a0ae4a548b4b3a321fe1b5da3db47b3ff391b973c498853fa3ee701
3
  size 1168138808
checkpoint-78/trainer_state.json CHANGED
@@ -10,130 +10,130 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.025423728813559324,
13
- "grad_norm": 0.0730147390450625,
14
  "learning_rate": 1.25e-06,
15
- "loss": 0.4558,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1271186440677966,
20
- "grad_norm": 0.06534838050795798,
21
  "learning_rate": 6.25e-06,
22
- "loss": 0.4283,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.2542372881355932,
27
- "grad_norm": 0.07880361896680486,
28
  "learning_rate": 9.979871469976197e-06,
29
- "loss": 0.4333,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3813559322033898,
34
- "grad_norm": 0.06167813403187308,
35
  "learning_rate": 9.755282581475769e-06,
36
- "loss": 0.3957,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.5084745762711864,
41
- "grad_norm": 0.056345965720785644,
42
  "learning_rate": 9.292243968009332e-06,
43
- "loss": 0.3871,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.635593220338983,
48
- "grad_norm": 0.05709095573219508,
49
  "learning_rate": 8.613974319136959e-06,
50
- "loss": 0.4074,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7627118644067796,
55
- "grad_norm": 0.05325720652524892,
56
  "learning_rate": 7.754484907260513e-06,
57
- "loss": 0.3923,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8898305084745762,
62
- "grad_norm": 0.06496552156375579,
63
  "learning_rate": 6.7568741204067145e-06,
64
- "loss": 0.3506,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.9915254237288136,
69
- "eval_loss": 0.42708608508110046,
70
- "eval_runtime": 28.3391,
71
- "eval_samples_per_second": 19.373,
72
- "eval_steps_per_second": 4.87,
73
  "step": 39
74
  },
75
  {
76
  "epoch": 1.0254237288135593,
77
- "grad_norm": 0.11980842959729002,
78
  "learning_rate": 5.671166329088278e-06,
79
- "loss": 0.4393,
80
  "step": 40
81
  },
82
  {
83
  "epoch": 1.152542372881356,
84
- "grad_norm": 0.05843339680839595,
85
  "learning_rate": 4.551803455482833e-06,
86
- "loss": 0.3204,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2796610169491525,
91
- "grad_norm": 0.054919903247120906,
92
  "learning_rate": 3.4549150281252635e-06,
93
- "loss": 0.3555,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.4067796610169492,
98
- "grad_norm": 0.07085936292685689,
99
  "learning_rate": 2.43550361297047e-06,
100
- "loss": 0.3271,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.5338983050847457,
105
- "grad_norm": 0.06803901954956897,
106
  "learning_rate": 1.544686755065677e-06,
107
- "loss": 0.3118,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.6610169491525424,
112
- "grad_norm": 0.06855321202228915,
113
  "learning_rate": 8.271337313934869e-07,
114
- "loss": 0.2804,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.788135593220339,
119
- "grad_norm": 0.06749004000856927,
120
  "learning_rate": 3.18825646801314e-07,
121
- "loss": 0.2871,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.9152542372881356,
126
- "grad_norm": 0.06909307377658395,
127
  "learning_rate": 4.52511911603265e-08,
128
- "loss": 0.2896,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9915254237288136,
133
- "eval_loss": 0.43976882100105286,
134
- "eval_runtime": 28.393,
135
- "eval_samples_per_second": 19.336,
136
- "eval_steps_per_second": 4.86,
137
  "step": 78
138
  }
139
  ],
@@ -154,7 +154,7 @@
154
  "attributes": {}
155
  }
156
  },
157
- "total_flos": 2.0040717157217075e+17,
158
  "train_batch_size": 1,
159
  "trial_name": null,
160
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.025423728813559324,
13
+ "grad_norm": 0.08035527647801548,
14
  "learning_rate": 1.25e-06,
15
+ "loss": 0.4007,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1271186440677966,
20
+ "grad_norm": 0.05733009561118041,
21
  "learning_rate": 6.25e-06,
22
+ "loss": 0.4134,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.2542372881355932,
27
+ "grad_norm": 0.061316952771613246,
28
  "learning_rate": 9.979871469976197e-06,
29
+ "loss": 0.4061,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3813559322033898,
34
+ "grad_norm": 0.0706914793637617,
35
  "learning_rate": 9.755282581475769e-06,
36
+ "loss": 0.4018,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.5084745762711864,
41
+ "grad_norm": 0.06306250853537879,
42
  "learning_rate": 9.292243968009332e-06,
43
+ "loss": 0.4068,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.635593220338983,
48
+ "grad_norm": 0.05396739437655817,
49
  "learning_rate": 8.613974319136959e-06,
50
+ "loss": 0.388,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7627118644067796,
55
+ "grad_norm": 0.055943979296443166,
56
  "learning_rate": 7.754484907260513e-06,
57
+ "loss": 0.3379,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8898305084745762,
62
+ "grad_norm": 0.06643011122771461,
63
  "learning_rate": 6.7568741204067145e-06,
64
+ "loss": 0.3801,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.9915254237288136,
69
+ "eval_loss": 0.42687076330184937,
70
+ "eval_runtime": 29.2208,
71
+ "eval_samples_per_second": 18.788,
72
+ "eval_steps_per_second": 4.723,
73
  "step": 39
74
  },
75
  {
76
  "epoch": 1.0254237288135593,
77
+ "grad_norm": 0.1257204507458321,
78
  "learning_rate": 5.671166329088278e-06,
79
+ "loss": 0.4294,
80
  "step": 40
81
  },
82
  {
83
  "epoch": 1.152542372881356,
84
+ "grad_norm": 0.061247984812326635,
85
  "learning_rate": 4.551803455482833e-06,
86
+ "loss": 0.3162,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2796610169491525,
91
+ "grad_norm": 0.06297833414755911,
92
  "learning_rate": 3.4549150281252635e-06,
93
+ "loss": 0.3145,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.4067796610169492,
98
+ "grad_norm": 0.06687382967843461,
99
  "learning_rate": 2.43550361297047e-06,
100
+ "loss": 0.2941,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.5338983050847457,
105
+ "grad_norm": 0.06683369207674608,
106
  "learning_rate": 1.544686755065677e-06,
107
+ "loss": 0.3145,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.6610169491525424,
112
+ "grad_norm": 0.0813495468013659,
113
  "learning_rate": 8.271337313934869e-07,
114
+ "loss": 0.2945,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.788135593220339,
119
+ "grad_norm": 0.07371530295761193,
120
  "learning_rate": 3.18825646801314e-07,
121
+ "loss": 0.3056,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.9152542372881356,
126
+ "grad_norm": 0.06581699726337101,
127
  "learning_rate": 4.52511911603265e-08,
128
+ "loss": 0.2775,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9915254237288136,
133
+ "eval_loss": 0.443694531917572,
134
+ "eval_runtime": 28.5624,
135
+ "eval_samples_per_second": 19.221,
136
+ "eval_steps_per_second": 4.832,
137
  "step": 78
138
  }
139
  ],
 
154
  "attributes": {}
155
  }
156
  },
157
+ "total_flos": 1.99474976032555e+17,
158
  "train_batch_size": 1,
159
  "trial_name": null,
160
  "trial_params": null
checkpoint-78/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc225b746e1882b49dcf5edd66225f1ecc9a54067aa2a93b6ed09a0bda7fe700
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1506ee53478326afd61ccdceba54438180344163122931a3bb2d342d659bade0
3
  size 7352
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82d7684f0bc2d9ba8cae8438d38b9f13df930e561a00db485fb33be89e83a0cc
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9463134d24a68e5467a165936344f7888de6dc726a027ff8087ca17573d59ed8
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcb4cbe4ed92182813741623598059e78fc5b0db59dce56cfa210a0fc186e7ca
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ed7d48cd123c89198e1f24f44b6af5b86f4f420c1b7eb7ade81652ce1e6dab
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88bdabd82844250933c777821ffa80dd245c0c9c8c553092254832b17d6e9cd9
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e803792750c32787a675e62d8189bb5df6a8122a89ce8df44fe3aedef0a169
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:300bcb73b19b2c48b147c91dfc95b728037c5278340cbae67b3793e4168300d9
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8447e4e7a0ae4a548b4b3a321fe1b5da3db47b3ff391b973c498853fa3ee701
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.9915254237288136,
3
- "total_flos": 2.0040717157217075e+17,
4
- "train_loss": 0.3583613080091966,
5
- "train_runtime": 1046.9285,
6
  "train_samples": 1886,
7
- "train_samples_per_second": 3.603,
8
- "train_steps_per_second": 0.075
9
  }
 
1
  {
2
  "epoch": 1.9915254237288136,
3
+ "total_flos": 1.99474976032555e+17,
4
+ "train_loss": 0.34869656616296524,
5
+ "train_runtime": 1209.7175,
6
  "train_samples": 1886,
7
+ "train_samples_per_second": 3.118,
8
+ "train_steps_per_second": 0.064
9
  }
trainer_state.json CHANGED
@@ -10,140 +10,140 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.025423728813559324,
13
- "grad_norm": 0.0730147390450625,
14
  "learning_rate": 1.25e-06,
15
- "loss": 0.4558,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1271186440677966,
20
- "grad_norm": 0.06534838050795798,
21
  "learning_rate": 6.25e-06,
22
- "loss": 0.4283,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.2542372881355932,
27
- "grad_norm": 0.07880361896680486,
28
  "learning_rate": 9.979871469976197e-06,
29
- "loss": 0.4333,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3813559322033898,
34
- "grad_norm": 0.06167813403187308,
35
  "learning_rate": 9.755282581475769e-06,
36
- "loss": 0.3957,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.5084745762711864,
41
- "grad_norm": 0.056345965720785644,
42
  "learning_rate": 9.292243968009332e-06,
43
- "loss": 0.3871,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.635593220338983,
48
- "grad_norm": 0.05709095573219508,
49
  "learning_rate": 8.613974319136959e-06,
50
- "loss": 0.4074,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7627118644067796,
55
- "grad_norm": 0.05325720652524892,
56
  "learning_rate": 7.754484907260513e-06,
57
- "loss": 0.3923,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8898305084745762,
62
- "grad_norm": 0.06496552156375579,
63
  "learning_rate": 6.7568741204067145e-06,
64
- "loss": 0.3506,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.9915254237288136,
69
- "eval_loss": 0.42708608508110046,
70
- "eval_runtime": 28.3391,
71
- "eval_samples_per_second": 19.373,
72
- "eval_steps_per_second": 4.87,
73
  "step": 39
74
  },
75
  {
76
  "epoch": 1.0254237288135593,
77
- "grad_norm": 0.11980842959729002,
78
  "learning_rate": 5.671166329088278e-06,
79
- "loss": 0.4393,
80
  "step": 40
81
  },
82
  {
83
  "epoch": 1.152542372881356,
84
- "grad_norm": 0.05843339680839595,
85
  "learning_rate": 4.551803455482833e-06,
86
- "loss": 0.3204,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2796610169491525,
91
- "grad_norm": 0.054919903247120906,
92
  "learning_rate": 3.4549150281252635e-06,
93
- "loss": 0.3555,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.4067796610169492,
98
- "grad_norm": 0.07085936292685689,
99
  "learning_rate": 2.43550361297047e-06,
100
- "loss": 0.3271,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.5338983050847457,
105
- "grad_norm": 0.06803901954956897,
106
  "learning_rate": 1.544686755065677e-06,
107
- "loss": 0.3118,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.6610169491525424,
112
- "grad_norm": 0.06855321202228915,
113
  "learning_rate": 8.271337313934869e-07,
114
- "loss": 0.2804,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.788135593220339,
119
- "grad_norm": 0.06749004000856927,
120
  "learning_rate": 3.18825646801314e-07,
121
- "loss": 0.2871,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.9152542372881356,
126
- "grad_norm": 0.06909307377658395,
127
  "learning_rate": 4.52511911603265e-08,
128
- "loss": 0.2896,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9915254237288136,
133
- "eval_loss": 0.43976882100105286,
134
- "eval_runtime": 28.393,
135
- "eval_samples_per_second": 19.336,
136
- "eval_steps_per_second": 4.86,
137
  "step": 78
138
  },
139
  {
140
  "epoch": 1.9915254237288136,
141
  "step": 78,
142
- "total_flos": 2.0040717157217075e+17,
143
- "train_loss": 0.3583613080091966,
144
- "train_runtime": 1046.9285,
145
- "train_samples_per_second": 3.603,
146
- "train_steps_per_second": 0.075
147
  }
148
  ],
149
  "logging_steps": 5,
@@ -163,7 +163,7 @@
163
  "attributes": {}
164
  }
165
  },
166
- "total_flos": 2.0040717157217075e+17,
167
  "train_batch_size": 1,
168
  "trial_name": null,
169
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.025423728813559324,
13
+ "grad_norm": 0.08035527647801548,
14
  "learning_rate": 1.25e-06,
15
+ "loss": 0.4007,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1271186440677966,
20
+ "grad_norm": 0.05733009561118041,
21
  "learning_rate": 6.25e-06,
22
+ "loss": 0.4134,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.2542372881355932,
27
+ "grad_norm": 0.061316952771613246,
28
  "learning_rate": 9.979871469976197e-06,
29
+ "loss": 0.4061,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3813559322033898,
34
+ "grad_norm": 0.0706914793637617,
35
  "learning_rate": 9.755282581475769e-06,
36
+ "loss": 0.4018,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.5084745762711864,
41
+ "grad_norm": 0.06306250853537879,
42
  "learning_rate": 9.292243968009332e-06,
43
+ "loss": 0.4068,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.635593220338983,
48
+ "grad_norm": 0.05396739437655817,
49
  "learning_rate": 8.613974319136959e-06,
50
+ "loss": 0.388,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7627118644067796,
55
+ "grad_norm": 0.055943979296443166,
56
  "learning_rate": 7.754484907260513e-06,
57
+ "loss": 0.3379,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8898305084745762,
62
+ "grad_norm": 0.06643011122771461,
63
  "learning_rate": 6.7568741204067145e-06,
64
+ "loss": 0.3801,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.9915254237288136,
69
+ "eval_loss": 0.42687076330184937,
70
+ "eval_runtime": 29.2208,
71
+ "eval_samples_per_second": 18.788,
72
+ "eval_steps_per_second": 4.723,
73
  "step": 39
74
  },
75
  {
76
  "epoch": 1.0254237288135593,
77
+ "grad_norm": 0.1257204507458321,
78
  "learning_rate": 5.671166329088278e-06,
79
+ "loss": 0.4294,
80
  "step": 40
81
  },
82
  {
83
  "epoch": 1.152542372881356,
84
+ "grad_norm": 0.061247984812326635,
85
  "learning_rate": 4.551803455482833e-06,
86
+ "loss": 0.3162,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2796610169491525,
91
+ "grad_norm": 0.06297833414755911,
92
  "learning_rate": 3.4549150281252635e-06,
93
+ "loss": 0.3145,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.4067796610169492,
98
+ "grad_norm": 0.06687382967843461,
99
  "learning_rate": 2.43550361297047e-06,
100
+ "loss": 0.2941,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.5338983050847457,
105
+ "grad_norm": 0.06683369207674608,
106
  "learning_rate": 1.544686755065677e-06,
107
+ "loss": 0.3145,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.6610169491525424,
112
+ "grad_norm": 0.0813495468013659,
113
  "learning_rate": 8.271337313934869e-07,
114
+ "loss": 0.2945,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.788135593220339,
119
+ "grad_norm": 0.07371530295761193,
120
  "learning_rate": 3.18825646801314e-07,
121
+ "loss": 0.3056,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.9152542372881356,
126
+ "grad_norm": 0.06581699726337101,
127
  "learning_rate": 4.52511911603265e-08,
128
+ "loss": 0.2775,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9915254237288136,
133
+ "eval_loss": 0.443694531917572,
134
+ "eval_runtime": 28.5624,
135
+ "eval_samples_per_second": 19.221,
136
+ "eval_steps_per_second": 4.832,
137
  "step": 78
138
  },
139
  {
140
  "epoch": 1.9915254237288136,
141
  "step": 78,
142
+ "total_flos": 1.99474976032555e+17,
143
+ "train_loss": 0.34869656616296524,
144
+ "train_runtime": 1209.7175,
145
+ "train_samples_per_second": 3.118,
146
+ "train_steps_per_second": 0.064
147
  }
148
  ],
149
  "logging_steps": 5,
 
163
  "attributes": {}
164
  }
165
  },
166
+ "total_flos": 1.99474976032555e+17,
167
  "train_batch_size": 1,
168
  "trial_name": null,
169
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc225b746e1882b49dcf5edd66225f1ecc9a54067aa2a93b6ed09a0bda7fe700
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1506ee53478326afd61ccdceba54438180344163122931a3bb2d342d659bade0
3
  size 7352