jssky commited on
Commit
1984084
·
verified ·
1 Parent(s): 9f49f72

Training in progress, step 420, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9d19d032b2720552ed5a8c04c8453d710ed0eed172ae313734cb428d3f003fc
3
  size 80013120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b543d9e019f6d5c3cd652914901b2739520d85b9e6044fe4d75f753c8dd4dc9
3
  size 80013120
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08e5683a29463e32746f14f186f042dd447b12cafcad678bbbddb34b9249098a
3
  size 41120084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14c61ad2090140d21c6df9e0a61cd6d3225a4e43d63a4283da89db183775f6ae
3
  size 41120084
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3369e2942ff752b68da734b9eaf1a12b8c42e1d8b80214950313c71f22a426be
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c08654f8daac8b1091c235d2fb6bd8b249208c723b2dd501bc93b7ca776f4cba
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fe9c01b8c53647998de80cbc88fe3102f7ee94466c3d3ba6db0d6d4b3bdc06d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b34984058cd5169df3c13d6905d5f65a7c10a7cf4235e831bb570e73473147
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5,
5
  "eval_steps": 140,
6
- "global_step": 280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1983,6 +1983,994 @@
1983
  "eval_samples_per_second": 16.888,
1984
  "eval_steps_per_second": 8.444,
1985
  "step": 280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1986
  }
1987
  ],
1988
  "logging_steps": 1,
@@ -2002,7 +2990,7 @@
2002
  "attributes": {}
2003
  }
2004
  },
2005
- "total_flos": 9.129139501635994e+16,
2006
  "train_batch_size": 2,
2007
  "trial_name": null,
2008
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.75,
5
  "eval_steps": 140,
6
+ "global_step": 420,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1983
  "eval_samples_per_second": 16.888,
1984
  "eval_steps_per_second": 8.444,
1985
  "step": 280
1986
+ },
1987
+ {
1988
+ "epoch": 0.5017857142857143,
1989
+ "grad_norm": 0.3074837923049927,
1990
+ "learning_rate": 0.00010228459587429497,
1991
+ "loss": 1.1389,
1992
+ "step": 281
1993
+ },
1994
+ {
1995
+ "epoch": 0.5035714285714286,
1996
+ "grad_norm": 0.3143484592437744,
1997
+ "learning_rate": 0.00010171351213038993,
1998
+ "loss": 0.9542,
1999
+ "step": 282
2000
+ },
2001
+ {
2002
+ "epoch": 0.5053571428571428,
2003
+ "grad_norm": 0.3524804413318634,
2004
+ "learning_rate": 0.00010114237248023404,
2005
+ "loss": 0.8578,
2006
+ "step": 283
2007
+ },
2008
+ {
2009
+ "epoch": 0.5071428571428571,
2010
+ "grad_norm": 0.335183322429657,
2011
+ "learning_rate": 0.00010057119555823085,
2012
+ "loss": 0.9228,
2013
+ "step": 284
2014
+ },
2015
+ {
2016
+ "epoch": 0.5089285714285714,
2017
+ "grad_norm": 0.35537081956863403,
2018
+ "learning_rate": 0.0001,
2019
+ "loss": 0.9541,
2020
+ "step": 285
2021
+ },
2022
+ {
2023
+ "epoch": 0.5107142857142857,
2024
+ "grad_norm": 0.3294563591480255,
2025
+ "learning_rate": 9.942880444176918e-05,
2026
+ "loss": 1.4223,
2027
+ "step": 286
2028
+ },
2029
+ {
2030
+ "epoch": 0.5125,
2031
+ "grad_norm": 0.32628077268600464,
2032
+ "learning_rate": 9.8857627519766e-05,
2033
+ "loss": 1.2942,
2034
+ "step": 287
2035
+ },
2036
+ {
2037
+ "epoch": 0.5142857142857142,
2038
+ "grad_norm": 0.32195404171943665,
2039
+ "learning_rate": 9.828648786961008e-05,
2040
+ "loss": 1.4239,
2041
+ "step": 288
2042
+ },
2043
+ {
2044
+ "epoch": 0.5160714285714286,
2045
+ "grad_norm": 0.3283804655075073,
2046
+ "learning_rate": 9.771540412570504e-05,
2047
+ "loss": 1.0303,
2048
+ "step": 289
2049
+ },
2050
+ {
2051
+ "epoch": 0.5178571428571429,
2052
+ "grad_norm": 0.356052428483963,
2053
+ "learning_rate": 9.71443949206304e-05,
2054
+ "loss": 1.0199,
2055
+ "step": 290
2056
+ },
2057
+ {
2058
+ "epoch": 0.5196428571428572,
2059
+ "grad_norm": 0.39479124546051025,
2060
+ "learning_rate": 9.657347888453367e-05,
2061
+ "loss": 1.1343,
2062
+ "step": 291
2063
+ },
2064
+ {
2065
+ "epoch": 0.5214285714285715,
2066
+ "grad_norm": 0.34791451692581177,
2067
+ "learning_rate": 9.60026746445227e-05,
2068
+ "loss": 1.521,
2069
+ "step": 292
2070
+ },
2071
+ {
2072
+ "epoch": 0.5232142857142857,
2073
+ "grad_norm": 0.3614530861377716,
2074
+ "learning_rate": 9.543200082405768e-05,
2075
+ "loss": 1.2346,
2076
+ "step": 293
2077
+ },
2078
+ {
2079
+ "epoch": 0.525,
2080
+ "grad_norm": 0.36958596110343933,
2081
+ "learning_rate": 9.486147604234371e-05,
2082
+ "loss": 1.0457,
2083
+ "step": 294
2084
+ },
2085
+ {
2086
+ "epoch": 0.5267857142857143,
2087
+ "grad_norm": 0.4293418824672699,
2088
+ "learning_rate": 9.42911189137232e-05,
2089
+ "loss": 1.1071,
2090
+ "step": 295
2091
+ },
2092
+ {
2093
+ "epoch": 0.5285714285714286,
2094
+ "grad_norm": 0.40408602356910706,
2095
+ "learning_rate": 9.372094804706867e-05,
2096
+ "loss": 1.3805,
2097
+ "step": 296
2098
+ },
2099
+ {
2100
+ "epoch": 0.5303571428571429,
2101
+ "grad_norm": 0.3892784118652344,
2102
+ "learning_rate": 9.315098204517543e-05,
2103
+ "loss": 0.9136,
2104
+ "step": 297
2105
+ },
2106
+ {
2107
+ "epoch": 0.5321428571428571,
2108
+ "grad_norm": 0.4003988206386566,
2109
+ "learning_rate": 9.258123950415479e-05,
2110
+ "loss": 1.3684,
2111
+ "step": 298
2112
+ },
2113
+ {
2114
+ "epoch": 0.5339285714285714,
2115
+ "grad_norm": 0.37116503715515137,
2116
+ "learning_rate": 9.201173901282724e-05,
2117
+ "loss": 1.7824,
2118
+ "step": 299
2119
+ },
2120
+ {
2121
+ "epoch": 0.5357142857142857,
2122
+ "grad_norm": 0.5286569595336914,
2123
+ "learning_rate": 9.144249915211605e-05,
2124
+ "loss": 1.9392,
2125
+ "step": 300
2126
+ },
2127
+ {
2128
+ "epoch": 0.5375,
2129
+ "grad_norm": 0.18338526785373688,
2130
+ "learning_rate": 9.087353849444085e-05,
2131
+ "loss": 1.4422,
2132
+ "step": 301
2133
+ },
2134
+ {
2135
+ "epoch": 0.5392857142857143,
2136
+ "grad_norm": 0.20191389322280884,
2137
+ "learning_rate": 9.030487560311186e-05,
2138
+ "loss": 1.5443,
2139
+ "step": 302
2140
+ },
2141
+ {
2142
+ "epoch": 0.5410714285714285,
2143
+ "grad_norm": 0.19955602288246155,
2144
+ "learning_rate": 8.973652903172423e-05,
2145
+ "loss": 1.6521,
2146
+ "step": 303
2147
+ },
2148
+ {
2149
+ "epoch": 0.5428571428571428,
2150
+ "grad_norm": 0.2253991812467575,
2151
+ "learning_rate": 8.916851732355255e-05,
2152
+ "loss": 1.6596,
2153
+ "step": 304
2154
+ },
2155
+ {
2156
+ "epoch": 0.5446428571428571,
2157
+ "grad_norm": 0.18380412459373474,
2158
+ "learning_rate": 8.860085901094595e-05,
2159
+ "loss": 1.5462,
2160
+ "step": 305
2161
+ },
2162
+ {
2163
+ "epoch": 0.5464285714285714,
2164
+ "grad_norm": 0.21318556368350983,
2165
+ "learning_rate": 8.803357261472343e-05,
2166
+ "loss": 1.6713,
2167
+ "step": 306
2168
+ },
2169
+ {
2170
+ "epoch": 0.5482142857142858,
2171
+ "grad_norm": 0.20480670034885406,
2172
+ "learning_rate": 8.746667664356956e-05,
2173
+ "loss": 1.7537,
2174
+ "step": 307
2175
+ },
2176
+ {
2177
+ "epoch": 0.55,
2178
+ "grad_norm": 0.21598470211029053,
2179
+ "learning_rate": 8.690018959343072e-05,
2180
+ "loss": 1.6955,
2181
+ "step": 308
2182
+ },
2183
+ {
2184
+ "epoch": 0.5517857142857143,
2185
+ "grad_norm": 0.21172207593917847,
2186
+ "learning_rate": 8.633412994691144e-05,
2187
+ "loss": 1.7187,
2188
+ "step": 309
2189
+ },
2190
+ {
2191
+ "epoch": 0.5535714285714286,
2192
+ "grad_norm": 0.22086191177368164,
2193
+ "learning_rate": 8.57685161726715e-05,
2194
+ "loss": 1.7579,
2195
+ "step": 310
2196
+ },
2197
+ {
2198
+ "epoch": 0.5553571428571429,
2199
+ "grad_norm": 0.2266392558813095,
2200
+ "learning_rate": 8.520336672482338e-05,
2201
+ "loss": 1.7486,
2202
+ "step": 311
2203
+ },
2204
+ {
2205
+ "epoch": 0.5571428571428572,
2206
+ "grad_norm": 0.2660948634147644,
2207
+ "learning_rate": 8.463870004233008e-05,
2208
+ "loss": 1.7903,
2209
+ "step": 312
2210
+ },
2211
+ {
2212
+ "epoch": 0.5589285714285714,
2213
+ "grad_norm": 0.2297395020723343,
2214
+ "learning_rate": 8.407453454840357e-05,
2215
+ "loss": 1.8017,
2216
+ "step": 313
2217
+ },
2218
+ {
2219
+ "epoch": 0.5607142857142857,
2220
+ "grad_norm": 0.21526682376861572,
2221
+ "learning_rate": 8.351088864990368e-05,
2222
+ "loss": 1.855,
2223
+ "step": 314
2224
+ },
2225
+ {
2226
+ "epoch": 0.5625,
2227
+ "grad_norm": 0.2516147196292877,
2228
+ "learning_rate": 8.294778073673762e-05,
2229
+ "loss": 1.7103,
2230
+ "step": 315
2231
+ },
2232
+ {
2233
+ "epoch": 0.5642857142857143,
2234
+ "grad_norm": 0.25641700625419617,
2235
+ "learning_rate": 8.238522918125983e-05,
2236
+ "loss": 1.9301,
2237
+ "step": 316
2238
+ },
2239
+ {
2240
+ "epoch": 0.5660714285714286,
2241
+ "grad_norm": 0.26828062534332275,
2242
+ "learning_rate": 8.182325233767267e-05,
2243
+ "loss": 1.8575,
2244
+ "step": 317
2245
+ },
2246
+ {
2247
+ "epoch": 0.5678571428571428,
2248
+ "grad_norm": 0.24787884950637817,
2249
+ "learning_rate": 8.126186854142752e-05,
2250
+ "loss": 2.0228,
2251
+ "step": 318
2252
+ },
2253
+ {
2254
+ "epoch": 0.5696428571428571,
2255
+ "grad_norm": 0.23658955097198486,
2256
+ "learning_rate": 8.070109610862668e-05,
2257
+ "loss": 1.7813,
2258
+ "step": 319
2259
+ },
2260
+ {
2261
+ "epoch": 0.5714285714285714,
2262
+ "grad_norm": 0.2818485498428345,
2263
+ "learning_rate": 8.014095333542548e-05,
2264
+ "loss": 1.7571,
2265
+ "step": 320
2266
+ },
2267
+ {
2268
+ "epoch": 0.5732142857142857,
2269
+ "grad_norm": 0.24982373416423798,
2270
+ "learning_rate": 7.958145849743569e-05,
2271
+ "loss": 1.602,
2272
+ "step": 321
2273
+ },
2274
+ {
2275
+ "epoch": 0.575,
2276
+ "grad_norm": 0.2815864682197571,
2277
+ "learning_rate": 7.902262984912909e-05,
2278
+ "loss": 1.7216,
2279
+ "step": 322
2280
+ },
2281
+ {
2282
+ "epoch": 0.5767857142857142,
2283
+ "grad_norm": 0.2675464451313019,
2284
+ "learning_rate": 7.846448562324183e-05,
2285
+ "loss": 1.0704,
2286
+ "step": 323
2287
+ },
2288
+ {
2289
+ "epoch": 0.5785714285714286,
2290
+ "grad_norm": 0.23938840627670288,
2291
+ "learning_rate": 7.79070440301796e-05,
2292
+ "loss": 1.282,
2293
+ "step": 324
2294
+ },
2295
+ {
2296
+ "epoch": 0.5803571428571429,
2297
+ "grad_norm": 0.2834213972091675,
2298
+ "learning_rate": 7.735032325742355e-05,
2299
+ "loss": 1.7934,
2300
+ "step": 325
2301
+ },
2302
+ {
2303
+ "epoch": 0.5821428571428572,
2304
+ "grad_norm": 0.3555513918399811,
2305
+ "learning_rate": 7.679434146893685e-05,
2306
+ "loss": 1.2089,
2307
+ "step": 326
2308
+ },
2309
+ {
2310
+ "epoch": 0.5839285714285715,
2311
+ "grad_norm": 0.3254348933696747,
2312
+ "learning_rate": 7.623911680457198e-05,
2313
+ "loss": 1.0845,
2314
+ "step": 327
2315
+ },
2316
+ {
2317
+ "epoch": 0.5857142857142857,
2318
+ "grad_norm": 0.3558744192123413,
2319
+ "learning_rate": 7.568466737947905e-05,
2320
+ "loss": 1.2339,
2321
+ "step": 328
2322
+ },
2323
+ {
2324
+ "epoch": 0.5875,
2325
+ "grad_norm": 0.32738080620765686,
2326
+ "learning_rate": 7.513101128351454e-05,
2327
+ "loss": 0.8492,
2328
+ "step": 329
2329
+ },
2330
+ {
2331
+ "epoch": 0.5892857142857143,
2332
+ "grad_norm": 0.36939212679862976,
2333
+ "learning_rate": 7.457816658065134e-05,
2334
+ "loss": 0.8973,
2335
+ "step": 330
2336
+ },
2337
+ {
2338
+ "epoch": 0.5910714285714286,
2339
+ "grad_norm": 0.3393215835094452,
2340
+ "learning_rate": 7.402615130838917e-05,
2341
+ "loss": 1.0078,
2342
+ "step": 331
2343
+ },
2344
+ {
2345
+ "epoch": 0.5928571428571429,
2346
+ "grad_norm": 0.402725487947464,
2347
+ "learning_rate": 7.347498347716624e-05,
2348
+ "loss": 1.2381,
2349
+ "step": 332
2350
+ },
2351
+ {
2352
+ "epoch": 0.5946428571428571,
2353
+ "grad_norm": 0.33164989948272705,
2354
+ "learning_rate": 7.292468106977148e-05,
2355
+ "loss": 1.197,
2356
+ "step": 333
2357
+ },
2358
+ {
2359
+ "epoch": 0.5964285714285714,
2360
+ "grad_norm": 0.3454689681529999,
2361
+ "learning_rate": 7.237526204075797e-05,
2362
+ "loss": 1.0244,
2363
+ "step": 334
2364
+ },
2365
+ {
2366
+ "epoch": 0.5982142857142857,
2367
+ "grad_norm": 0.3584868907928467,
2368
+ "learning_rate": 7.182674431585704e-05,
2369
+ "loss": 1.062,
2370
+ "step": 335
2371
+ },
2372
+ {
2373
+ "epoch": 0.6,
2374
+ "grad_norm": 0.3318020701408386,
2375
+ "learning_rate": 7.127914579139338e-05,
2376
+ "loss": 1.3696,
2377
+ "step": 336
2378
+ },
2379
+ {
2380
+ "epoch": 0.6017857142857143,
2381
+ "grad_norm": 0.38772639632225037,
2382
+ "learning_rate": 7.073248433370124e-05,
2383
+ "loss": 1.1725,
2384
+ "step": 337
2385
+ },
2386
+ {
2387
+ "epoch": 0.6035714285714285,
2388
+ "grad_norm": 0.33527833223342896,
2389
+ "learning_rate": 7.018677777854157e-05,
2390
+ "loss": 1.0849,
2391
+ "step": 338
2392
+ },
2393
+ {
2394
+ "epoch": 0.6053571428571428,
2395
+ "grad_norm": 0.3958960473537445,
2396
+ "learning_rate": 6.964204393051981e-05,
2397
+ "loss": 0.8494,
2398
+ "step": 339
2399
+ },
2400
+ {
2401
+ "epoch": 0.6071428571428571,
2402
+ "grad_norm": 0.3545495569705963,
2403
+ "learning_rate": 6.909830056250527e-05,
2404
+ "loss": 0.843,
2405
+ "step": 340
2406
+ },
2407
+ {
2408
+ "epoch": 0.6089285714285714,
2409
+ "grad_norm": 0.3475857675075531,
2410
+ "learning_rate": 6.855556541505122e-05,
2411
+ "loss": 1.1228,
2412
+ "step": 341
2413
+ },
2414
+ {
2415
+ "epoch": 0.6107142857142858,
2416
+ "grad_norm": 0.4085654616355896,
2417
+ "learning_rate": 6.801385619581592e-05,
2418
+ "loss": 0.8092,
2419
+ "step": 342
2420
+ },
2421
+ {
2422
+ "epoch": 0.6125,
2423
+ "grad_norm": 0.4605577886104584,
2424
+ "learning_rate": 6.747319057898503e-05,
2425
+ "loss": 1.0999,
2426
+ "step": 343
2427
+ },
2428
+ {
2429
+ "epoch": 0.6142857142857143,
2430
+ "grad_norm": 0.3840469717979431,
2431
+ "learning_rate": 6.693358620469487e-05,
2432
+ "loss": 1.2712,
2433
+ "step": 344
2434
+ },
2435
+ {
2436
+ "epoch": 0.6160714285714286,
2437
+ "grad_norm": 0.3712100684642792,
2438
+ "learning_rate": 6.639506067845697e-05,
2439
+ "loss": 1.1401,
2440
+ "step": 345
2441
+ },
2442
+ {
2443
+ "epoch": 0.6178571428571429,
2444
+ "grad_norm": 0.41670674085617065,
2445
+ "learning_rate": 6.585763157058358e-05,
2446
+ "loss": 1.151,
2447
+ "step": 346
2448
+ },
2449
+ {
2450
+ "epoch": 0.6196428571428572,
2451
+ "grad_norm": 0.5912812352180481,
2452
+ "learning_rate": 6.53213164156144e-05,
2453
+ "loss": 1.447,
2454
+ "step": 347
2455
+ },
2456
+ {
2457
+ "epoch": 0.6214285714285714,
2458
+ "grad_norm": 0.3995843231678009,
2459
+ "learning_rate": 6.478613271174453e-05,
2460
+ "loss": 1.6645,
2461
+ "step": 348
2462
+ },
2463
+ {
2464
+ "epoch": 0.6232142857142857,
2465
+ "grad_norm": 0.5337942242622375,
2466
+ "learning_rate": 6.425209792025358e-05,
2467
+ "loss": 1.8703,
2468
+ "step": 349
2469
+ },
2470
+ {
2471
+ "epoch": 0.625,
2472
+ "grad_norm": 0.5726771354675293,
2473
+ "learning_rate": 6.371922946493591e-05,
2474
+ "loss": 1.9795,
2475
+ "step": 350
2476
+ },
2477
+ {
2478
+ "epoch": 0.6267857142857143,
2479
+ "grad_norm": 0.17994743585586548,
2480
+ "learning_rate": 6.318754473153221e-05,
2481
+ "loss": 1.5463,
2482
+ "step": 351
2483
+ },
2484
+ {
2485
+ "epoch": 0.6285714285714286,
2486
+ "grad_norm": 0.16961722075939178,
2487
+ "learning_rate": 6.26570610671622e-05,
2488
+ "loss": 1.5388,
2489
+ "step": 352
2490
+ },
2491
+ {
2492
+ "epoch": 0.6303571428571428,
2493
+ "grad_norm": 0.19410116970539093,
2494
+ "learning_rate": 6.21277957797587e-05,
2495
+ "loss": 1.7282,
2496
+ "step": 353
2497
+ },
2498
+ {
2499
+ "epoch": 0.6321428571428571,
2500
+ "grad_norm": 0.1906638890504837,
2501
+ "learning_rate": 6.159976613750286e-05,
2502
+ "loss": 1.5167,
2503
+ "step": 354
2504
+ },
2505
+ {
2506
+ "epoch": 0.6339285714285714,
2507
+ "grad_norm": 0.21918904781341553,
2508
+ "learning_rate": 6.107298936826086e-05,
2509
+ "loss": 1.7446,
2510
+ "step": 355
2511
+ },
2512
+ {
2513
+ "epoch": 0.6357142857142857,
2514
+ "grad_norm": 0.19429104030132294,
2515
+ "learning_rate": 6.0547482659021706e-05,
2516
+ "loss": 1.7166,
2517
+ "step": 356
2518
+ },
2519
+ {
2520
+ "epoch": 0.6375,
2521
+ "grad_norm": 0.21244099736213684,
2522
+ "learning_rate": 6.002326315533665e-05,
2523
+ "loss": 1.7319,
2524
+ "step": 357
2525
+ },
2526
+ {
2527
+ "epoch": 0.6392857142857142,
2528
+ "grad_norm": 0.22793884575366974,
2529
+ "learning_rate": 5.950034796075947e-05,
2530
+ "loss": 1.6573,
2531
+ "step": 358
2532
+ },
2533
+ {
2534
+ "epoch": 0.6410714285714286,
2535
+ "grad_norm": 0.23558048903942108,
2536
+ "learning_rate": 5.897875413628884e-05,
2537
+ "loss": 1.7359,
2538
+ "step": 359
2539
+ },
2540
+ {
2541
+ "epoch": 0.6428571428571429,
2542
+ "grad_norm": 0.21951396763324738,
2543
+ "learning_rate": 5.845849869981137e-05,
2544
+ "loss": 1.6536,
2545
+ "step": 360
2546
+ },
2547
+ {
2548
+ "epoch": 0.6446428571428572,
2549
+ "grad_norm": 0.22028475999832153,
2550
+ "learning_rate": 5.793959862554652e-05,
2551
+ "loss": 1.7257,
2552
+ "step": 361
2553
+ },
2554
+ {
2555
+ "epoch": 0.6464285714285715,
2556
+ "grad_norm": 0.23070622980594635,
2557
+ "learning_rate": 5.7422070843492734e-05,
2558
+ "loss": 1.6149,
2559
+ "step": 362
2560
+ },
2561
+ {
2562
+ "epoch": 0.6482142857142857,
2563
+ "grad_norm": 0.22408127784729004,
2564
+ "learning_rate": 5.6905932238875123e-05,
2565
+ "loss": 1.5936,
2566
+ "step": 363
2567
+ },
2568
+ {
2569
+ "epoch": 0.65,
2570
+ "grad_norm": 0.23098969459533691,
2571
+ "learning_rate": 5.639119965159446e-05,
2572
+ "loss": 1.7313,
2573
+ "step": 364
2574
+ },
2575
+ {
2576
+ "epoch": 0.6517857142857143,
2577
+ "grad_norm": 0.2533377707004547,
2578
+ "learning_rate": 5.5877889875677845e-05,
2579
+ "loss": 1.9422,
2580
+ "step": 365
2581
+ },
2582
+ {
2583
+ "epoch": 0.6535714285714286,
2584
+ "grad_norm": 0.2515897750854492,
2585
+ "learning_rate": 5.5366019658730825e-05,
2586
+ "loss": 1.8331,
2587
+ "step": 366
2588
+ },
2589
+ {
2590
+ "epoch": 0.6553571428571429,
2591
+ "grad_norm": 0.2757863700389862,
2592
+ "learning_rate": 5.485560570139061e-05,
2593
+ "loss": 1.7759,
2594
+ "step": 367
2595
+ },
2596
+ {
2597
+ "epoch": 0.6571428571428571,
2598
+ "grad_norm": 0.29774826765060425,
2599
+ "learning_rate": 5.434666465678175e-05,
2600
+ "loss": 1.334,
2601
+ "step": 368
2602
+ },
2603
+ {
2604
+ "epoch": 0.6589285714285714,
2605
+ "grad_norm": 0.28664523363113403,
2606
+ "learning_rate": 5.383921312997242e-05,
2607
+ "loss": 1.6234,
2608
+ "step": 369
2609
+ },
2610
+ {
2611
+ "epoch": 0.6607142857142857,
2612
+ "grad_norm": 0.30774933099746704,
2613
+ "learning_rate": 5.333326767743263e-05,
2614
+ "loss": 1.4743,
2615
+ "step": 370
2616
+ },
2617
+ {
2618
+ "epoch": 0.6625,
2619
+ "grad_norm": 0.26094669103622437,
2620
+ "learning_rate": 5.282884480649435e-05,
2621
+ "loss": 1.1046,
2622
+ "step": 371
2623
+ },
2624
+ {
2625
+ "epoch": 0.6642857142857143,
2626
+ "grad_norm": 0.2818247079849243,
2627
+ "learning_rate": 5.232596097481251e-05,
2628
+ "loss": 0.8417,
2629
+ "step": 372
2630
+ },
2631
+ {
2632
+ "epoch": 0.6660714285714285,
2633
+ "grad_norm": 0.3089035749435425,
2634
+ "learning_rate": 5.182463258982846e-05,
2635
+ "loss": 1.3922,
2636
+ "step": 373
2637
+ },
2638
+ {
2639
+ "epoch": 0.6678571428571428,
2640
+ "grad_norm": 0.2375502586364746,
2641
+ "learning_rate": 5.132487600823438e-05,
2642
+ "loss": 1.0855,
2643
+ "step": 374
2644
+ },
2645
+ {
2646
+ "epoch": 0.6696428571428571,
2647
+ "grad_norm": 0.3417636454105377,
2648
+ "learning_rate": 5.082670753543961e-05,
2649
+ "loss": 1.0819,
2650
+ "step": 375
2651
+ },
2652
+ {
2653
+ "epoch": 0.6714285714285714,
2654
+ "grad_norm": 0.2587840259075165,
2655
+ "learning_rate": 5.033014342503889e-05,
2656
+ "loss": 1.1154,
2657
+ "step": 376
2658
+ },
2659
+ {
2660
+ "epoch": 0.6732142857142858,
2661
+ "grad_norm": 0.29829278588294983,
2662
+ "learning_rate": 4.9835199878281765e-05,
2663
+ "loss": 0.9634,
2664
+ "step": 377
2665
+ },
2666
+ {
2667
+ "epoch": 0.675,
2668
+ "grad_norm": 0.307190477848053,
2669
+ "learning_rate": 4.9341893043544185e-05,
2670
+ "loss": 1.1533,
2671
+ "step": 378
2672
+ },
2673
+ {
2674
+ "epoch": 0.6767857142857143,
2675
+ "grad_norm": 0.3548847734928131,
2676
+ "learning_rate": 4.8850239015801625e-05,
2677
+ "loss": 1.2046,
2678
+ "step": 379
2679
+ },
2680
+ {
2681
+ "epoch": 0.6785714285714286,
2682
+ "grad_norm": 0.3130282759666443,
2683
+ "learning_rate": 4.836025383610382e-05,
2684
+ "loss": 1.1391,
2685
+ "step": 380
2686
+ },
2687
+ {
2688
+ "epoch": 0.6803571428571429,
2689
+ "grad_norm": 0.3400501012802124,
2690
+ "learning_rate": 4.787195349105159e-05,
2691
+ "loss": 1.0226,
2692
+ "step": 381
2693
+ },
2694
+ {
2695
+ "epoch": 0.6821428571428572,
2696
+ "grad_norm": 0.3462565541267395,
2697
+ "learning_rate": 4.7385353912275165e-05,
2698
+ "loss": 1.0968,
2699
+ "step": 382
2700
+ },
2701
+ {
2702
+ "epoch": 0.6839285714285714,
2703
+ "grad_norm": 0.331123948097229,
2704
+ "learning_rate": 4.690047097591427e-05,
2705
+ "loss": 1.1918,
2706
+ "step": 383
2707
+ },
2708
+ {
2709
+ "epoch": 0.6857142857142857,
2710
+ "grad_norm": 0.354432612657547,
2711
+ "learning_rate": 4.6417320502100316e-05,
2712
+ "loss": 1.2261,
2713
+ "step": 384
2714
+ },
2715
+ {
2716
+ "epoch": 0.6875,
2717
+ "grad_norm": 0.34844398498535156,
2718
+ "learning_rate": 4.593591825444028e-05,
2719
+ "loss": 0.7991,
2720
+ "step": 385
2721
+ },
2722
+ {
2723
+ "epoch": 0.6892857142857143,
2724
+ "grad_norm": 0.3462367653846741,
2725
+ "learning_rate": 4.545627993950201e-05,
2726
+ "loss": 1.1343,
2727
+ "step": 386
2728
+ },
2729
+ {
2730
+ "epoch": 0.6910714285714286,
2731
+ "grad_norm": 0.3352709114551544,
2732
+ "learning_rate": 4.497842120630229e-05,
2733
+ "loss": 1.1023,
2734
+ "step": 387
2735
+ },
2736
+ {
2737
+ "epoch": 0.6928571428571428,
2738
+ "grad_norm": 0.3581717610359192,
2739
+ "learning_rate": 4.4502357645795976e-05,
2740
+ "loss": 0.9181,
2741
+ "step": 388
2742
+ },
2743
+ {
2744
+ "epoch": 0.6946428571428571,
2745
+ "grad_norm": 0.35995498299598694,
2746
+ "learning_rate": 4.402810479036725e-05,
2747
+ "loss": 1.3445,
2748
+ "step": 389
2749
+ },
2750
+ {
2751
+ "epoch": 0.6964285714285714,
2752
+ "grad_norm": 0.372935950756073,
2753
+ "learning_rate": 4.355567811332311e-05,
2754
+ "loss": 1.0725,
2755
+ "step": 390
2756
+ },
2757
+ {
2758
+ "epoch": 0.6982142857142857,
2759
+ "grad_norm": 0.3759123980998993,
2760
+ "learning_rate": 4.30850930283882e-05,
2761
+ "loss": 1.1824,
2762
+ "step": 391
2763
+ },
2764
+ {
2765
+ "epoch": 0.7,
2766
+ "grad_norm": 0.391770601272583,
2767
+ "learning_rate": 4.2616364889202254e-05,
2768
+ "loss": 1.3516,
2769
+ "step": 392
2770
+ },
2771
+ {
2772
+ "epoch": 0.7017857142857142,
2773
+ "grad_norm": 0.3785625696182251,
2774
+ "learning_rate": 4.214950898881892e-05,
2775
+ "loss": 1.1624,
2776
+ "step": 393
2777
+ },
2778
+ {
2779
+ "epoch": 0.7035714285714286,
2780
+ "grad_norm": 0.4284125864505768,
2781
+ "learning_rate": 4.168454055920681e-05,
2782
+ "loss": 1.1318,
2783
+ "step": 394
2784
+ },
2785
+ {
2786
+ "epoch": 0.7053571428571429,
2787
+ "grad_norm": 0.391161173582077,
2788
+ "learning_rate": 4.12214747707527e-05,
2789
+ "loss": 1.4543,
2790
+ "step": 395
2791
+ },
2792
+ {
2793
+ "epoch": 0.7071428571428572,
2794
+ "grad_norm": 0.3723802864551544,
2795
+ "learning_rate": 4.0760326731766374e-05,
2796
+ "loss": 1.4265,
2797
+ "step": 396
2798
+ },
2799
+ {
2800
+ "epoch": 0.7089285714285715,
2801
+ "grad_norm": 0.4321235418319702,
2802
+ "learning_rate": 4.030111148798775e-05,
2803
+ "loss": 1.4523,
2804
+ "step": 397
2805
+ },
2806
+ {
2807
+ "epoch": 0.7107142857142857,
2808
+ "grad_norm": 0.43963977694511414,
2809
+ "learning_rate": 3.9843844022096135e-05,
2810
+ "loss": 1.7426,
2811
+ "step": 398
2812
+ },
2813
+ {
2814
+ "epoch": 0.7125,
2815
+ "grad_norm": 0.5444914698600769,
2816
+ "learning_rate": 3.938853925322118e-05,
2817
+ "loss": 1.8907,
2818
+ "step": 399
2819
+ },
2820
+ {
2821
+ "epoch": 0.7142857142857143,
2822
+ "grad_norm": 0.9059175252914429,
2823
+ "learning_rate": 3.893521203645618e-05,
2824
+ "loss": 2.2147,
2825
+ "step": 400
2826
+ },
2827
+ {
2828
+ "epoch": 0.7160714285714286,
2829
+ "grad_norm": 0.20250235497951508,
2830
+ "learning_rate": 3.848387716237353e-05,
2831
+ "loss": 1.7341,
2832
+ "step": 401
2833
+ },
2834
+ {
2835
+ "epoch": 0.7178571428571429,
2836
+ "grad_norm": 0.1863853931427002,
2837
+ "learning_rate": 3.8034549356541894e-05,
2838
+ "loss": 1.6956,
2839
+ "step": 402
2840
+ },
2841
+ {
2842
+ "epoch": 0.7196428571428571,
2843
+ "grad_norm": 0.19317291676998138,
2844
+ "learning_rate": 3.7587243279046056e-05,
2845
+ "loss": 1.7165,
2846
+ "step": 403
2847
+ },
2848
+ {
2849
+ "epoch": 0.7214285714285714,
2850
+ "grad_norm": 0.21101966500282288,
2851
+ "learning_rate": 3.714197352400849e-05,
2852
+ "loss": 1.8306,
2853
+ "step": 404
2854
+ },
2855
+ {
2856
+ "epoch": 0.7232142857142857,
2857
+ "grad_norm": 0.22385361790657043,
2858
+ "learning_rate": 3.669875461911297e-05,
2859
+ "loss": 1.7104,
2860
+ "step": 405
2861
+ },
2862
+ {
2863
+ "epoch": 0.725,
2864
+ "grad_norm": 0.22555914521217346,
2865
+ "learning_rate": 3.6257601025131026e-05,
2866
+ "loss": 1.5668,
2867
+ "step": 406
2868
+ },
2869
+ {
2870
+ "epoch": 0.7267857142857143,
2871
+ "grad_norm": 0.21916812658309937,
2872
+ "learning_rate": 3.581852713544983e-05,
2873
+ "loss": 1.7827,
2874
+ "step": 407
2875
+ },
2876
+ {
2877
+ "epoch": 0.7285714285714285,
2878
+ "grad_norm": 0.23447498679161072,
2879
+ "learning_rate": 3.538154727560259e-05,
2880
+ "loss": 1.8308,
2881
+ "step": 408
2882
+ },
2883
+ {
2884
+ "epoch": 0.7303571428571428,
2885
+ "grad_norm": 0.21024593710899353,
2886
+ "learning_rate": 3.494667570280132e-05,
2887
+ "loss": 1.613,
2888
+ "step": 409
2889
+ },
2890
+ {
2891
+ "epoch": 0.7321428571428571,
2892
+ "grad_norm": 0.23882578313350677,
2893
+ "learning_rate": 3.45139266054715e-05,
2894
+ "loss": 1.6853,
2895
+ "step": 410
2896
+ },
2897
+ {
2898
+ "epoch": 0.7339285714285714,
2899
+ "grad_norm": 0.23162604868412018,
2900
+ "learning_rate": 3.408331410278929e-05,
2901
+ "loss": 1.7371,
2902
+ "step": 411
2903
+ },
2904
+ {
2905
+ "epoch": 0.7357142857142858,
2906
+ "grad_norm": 0.23150567710399628,
2907
+ "learning_rate": 3.3654852244220826e-05,
2908
+ "loss": 1.7505,
2909
+ "step": 412
2910
+ },
2911
+ {
2912
+ "epoch": 0.7375,
2913
+ "grad_norm": 0.23027552664279938,
2914
+ "learning_rate": 3.322855500906373e-05,
2915
+ "loss": 1.7128,
2916
+ "step": 413
2917
+ },
2918
+ {
2919
+ "epoch": 0.7392857142857143,
2920
+ "grad_norm": 0.22426114976406097,
2921
+ "learning_rate": 3.2804436305991214e-05,
2922
+ "loss": 1.7721,
2923
+ "step": 414
2924
+ },
2925
+ {
2926
+ "epoch": 0.7410714285714286,
2927
+ "grad_norm": 0.22792723774909973,
2928
+ "learning_rate": 3.238250997259808e-05,
2929
+ "loss": 1.7089,
2930
+ "step": 415
2931
+ },
2932
+ {
2933
+ "epoch": 0.7428571428571429,
2934
+ "grad_norm": 0.2450588494539261,
2935
+ "learning_rate": 3.196278977494934e-05,
2936
+ "loss": 1.744,
2937
+ "step": 416
2938
+ },
2939
+ {
2940
+ "epoch": 0.7446428571428572,
2941
+ "grad_norm": 0.2348526120185852,
2942
+ "learning_rate": 3.154528940713113e-05,
2943
+ "loss": 1.8496,
2944
+ "step": 417
2945
+ },
2946
+ {
2947
+ "epoch": 0.7464285714285714,
2948
+ "grad_norm": 0.2519124746322632,
2949
+ "learning_rate": 3.113002249080386e-05,
2950
+ "loss": 1.76,
2951
+ "step": 418
2952
+ },
2953
+ {
2954
+ "epoch": 0.7482142857142857,
2955
+ "grad_norm": 0.27859431505203247,
2956
+ "learning_rate": 3.071700257475768e-05,
2957
+ "loss": 1.6493,
2958
+ "step": 419
2959
+ },
2960
+ {
2961
+ "epoch": 0.75,
2962
+ "grad_norm": 0.2539427876472473,
2963
+ "learning_rate": 3.030624313447067e-05,
2964
+ "loss": 1.7619,
2965
+ "step": 420
2966
+ },
2967
+ {
2968
+ "epoch": 0.75,
2969
+ "eval_loss": 1.4375773668289185,
2970
+ "eval_runtime": 13.3809,
2971
+ "eval_samples_per_second": 17.637,
2972
+ "eval_steps_per_second": 8.819,
2973
+ "step": 420
2974
  }
2975
  ],
2976
  "logging_steps": 1,
 
2990
  "attributes": {}
2991
  }
2992
  },
2993
+ "total_flos": 1.3697781125559091e+17,
2994
  "train_batch_size": 2,
2995
  "trial_name": null,
2996
  "trial_params": null