fats-fme commited on
Commit
ea6d694
·
verified ·
1 Parent(s): 36267a3

Training in progress, step 423, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41c04db13401440bb120e3569a23dbda67cd78267d7c0b1c77f3d3b3cee4cdee
3
  size 101752088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efedcd7712efe5df4242d40d0fc157567550dc57198de0fde11a067a253c3786
3
  size 101752088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1a032c7471714a5d4a253e904e854da99f9722e45c96bc0da82257681a15490
3
  size 203713238
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a3093ef84d124bf4f3a388a3f58cedd89b5fbf3ec80a866e1189f65649a0f5e
3
  size 203713238
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45144a3e80d33a7835b701c1b7b63faebde586b75158a47eb826cd0228136ec0
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e52b4ddcd925a725a65812af6610fe4debc708c6e4fc1ee7e0e17160e2a6fc5
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feb6925b0db33b6f02f0ccbd50be336d8d47178a933641d2c637051d854a6c60
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d9aa8c4c4812086f9a0cd74c7d98dc727224f492c2c8deb8168a9fa04e2846e
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c28833a5c9fe2e108390575900c0ade8d470ff95484328f12052b199c28b6360
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2bb049f58262ac24b66ea8e4bbb35c588cda72b0f20c7495d16197e65e5d114
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5007769145394007,
5
  "eval_steps": 141,
6
- "global_step": 282,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2005,6 +2005,1001 @@
2005
  "eval_samples_per_second": 5.989,
2006
  "eval_steps_per_second": 1.502,
2007
  "step": 282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2008
  }
2009
  ],
2010
  "logging_steps": 1,
@@ -2024,7 +3019,7 @@
2024
  "attributes": {}
2025
  }
2026
  },
2027
- "total_flos": 3.7141566324945715e+17,
2028
  "train_batch_size": 2,
2029
  "trial_name": null,
2030
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.751165371809101,
5
  "eval_steps": 141,
6
+ "global_step": 423,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2005
  "eval_samples_per_second": 5.989,
2006
  "eval_steps_per_second": 1.502,
2007
  "step": 282
2008
+ },
2009
+ {
2010
+ "epoch": 0.502552719200888,
2011
+ "grad_norm": 0.331061989068985,
2012
+ "learning_rate": 0.00011434168642236964,
2013
+ "loss": 0.8114,
2014
+ "step": 283
2015
+ },
2016
+ {
2017
+ "epoch": 0.5043285238623751,
2018
+ "grad_norm": 0.3186919689178467,
2019
+ "learning_rate": 0.00011373535578184082,
2020
+ "loss": 0.7872,
2021
+ "step": 284
2022
+ },
2023
+ {
2024
+ "epoch": 0.5061043285238623,
2025
+ "grad_norm": 0.3114188611507416,
2026
+ "learning_rate": 0.00011312851002705383,
2027
+ "loss": 0.7311,
2028
+ "step": 285
2029
+ },
2030
+ {
2031
+ "epoch": 0.5078801331853496,
2032
+ "grad_norm": 0.3148879408836365,
2033
+ "learning_rate": 0.00011252117191642175,
2034
+ "loss": 0.7311,
2035
+ "step": 286
2036
+ },
2037
+ {
2038
+ "epoch": 0.5096559378468368,
2039
+ "grad_norm": 0.3390887379646301,
2040
+ "learning_rate": 0.00011191336422682237,
2041
+ "loss": 0.7773,
2042
+ "step": 287
2043
+ },
2044
+ {
2045
+ "epoch": 0.5114317425083241,
2046
+ "grad_norm": 0.31982842087745667,
2047
+ "learning_rate": 0.00011130510975274409,
2048
+ "loss": 0.7474,
2049
+ "step": 288
2050
+ },
2051
+ {
2052
+ "epoch": 0.5132075471698113,
2053
+ "grad_norm": 0.31643104553222656,
2054
+ "learning_rate": 0.00011069643130543084,
2055
+ "loss": 0.7375,
2056
+ "step": 289
2057
+ },
2058
+ {
2059
+ "epoch": 0.5149833518312985,
2060
+ "grad_norm": 0.33758479356765747,
2061
+ "learning_rate": 0.00011008735171202684,
2062
+ "loss": 0.7411,
2063
+ "step": 290
2064
+ },
2065
+ {
2066
+ "epoch": 0.5167591564927858,
2067
+ "grad_norm": 0.324556440114975,
2068
+ "learning_rate": 0.00010947789381472035,
2069
+ "loss": 0.7235,
2070
+ "step": 291
2071
+ },
2072
+ {
2073
+ "epoch": 0.518534961154273,
2074
+ "grad_norm": 0.3768496513366699,
2075
+ "learning_rate": 0.00010886808046988717,
2076
+ "loss": 0.7618,
2077
+ "step": 292
2078
+ },
2079
+ {
2080
+ "epoch": 0.5203107658157603,
2081
+ "grad_norm": 0.34034618735313416,
2082
+ "learning_rate": 0.00010825793454723325,
2083
+ "loss": 0.7426,
2084
+ "step": 293
2085
+ },
2086
+ {
2087
+ "epoch": 0.5220865704772475,
2088
+ "grad_norm": 0.3409979045391083,
2089
+ "learning_rate": 0.00010764747892893723,
2090
+ "loss": 0.7327,
2091
+ "step": 294
2092
+ },
2093
+ {
2094
+ "epoch": 0.5238623751387348,
2095
+ "grad_norm": 0.35839787125587463,
2096
+ "learning_rate": 0.00010703673650879218,
2097
+ "loss": 0.7057,
2098
+ "step": 295
2099
+ },
2100
+ {
2101
+ "epoch": 0.525638179800222,
2102
+ "grad_norm": 0.3807874023914337,
2103
+ "learning_rate": 0.00010642573019134703,
2104
+ "loss": 0.7225,
2105
+ "step": 296
2106
+ },
2107
+ {
2108
+ "epoch": 0.5274139844617092,
2109
+ "grad_norm": 0.4682140648365021,
2110
+ "learning_rate": 0.00010581448289104758,
2111
+ "loss": 0.715,
2112
+ "step": 297
2113
+ },
2114
+ {
2115
+ "epoch": 0.5291897891231965,
2116
+ "grad_norm": 0.4261273145675659,
2117
+ "learning_rate": 0.00010520301753137724,
2118
+ "loss": 0.7239,
2119
+ "step": 298
2120
+ },
2121
+ {
2122
+ "epoch": 0.5309655937846837,
2123
+ "grad_norm": 0.4854682981967926,
2124
+ "learning_rate": 0.00010459135704399718,
2125
+ "loss": 0.7304,
2126
+ "step": 299
2127
+ },
2128
+ {
2129
+ "epoch": 0.532741398446171,
2130
+ "grad_norm": 0.6740989685058594,
2131
+ "learning_rate": 0.00010397952436788642,
2132
+ "loss": 0.8604,
2133
+ "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.5345172031076582,
2137
+ "grad_norm": 0.2903907299041748,
2138
+ "learning_rate": 0.00010336754244848157,
2139
+ "loss": 0.8551,
2140
+ "step": 301
2141
+ },
2142
+ {
2143
+ "epoch": 0.5362930077691453,
2144
+ "grad_norm": 0.28648582100868225,
2145
+ "learning_rate": 0.00010275543423681621,
2146
+ "loss": 0.7958,
2147
+ "step": 302
2148
+ },
2149
+ {
2150
+ "epoch": 0.5380688124306326,
2151
+ "grad_norm": 0.33123767375946045,
2152
+ "learning_rate": 0.00010214322268866032,
2153
+ "loss": 0.7853,
2154
+ "step": 303
2155
+ },
2156
+ {
2157
+ "epoch": 0.5398446170921198,
2158
+ "grad_norm": 0.31327784061431885,
2159
+ "learning_rate": 0.00010153093076365923,
2160
+ "loss": 0.7856,
2161
+ "step": 304
2162
+ },
2163
+ {
2164
+ "epoch": 0.5416204217536071,
2165
+ "grad_norm": 0.3101854622364044,
2166
+ "learning_rate": 0.00010091858142447265,
2167
+ "loss": 0.7694,
2168
+ "step": 305
2169
+ },
2170
+ {
2171
+ "epoch": 0.5433962264150943,
2172
+ "grad_norm": 0.3217926621437073,
2173
+ "learning_rate": 0.00010030619763591347,
2174
+ "loss": 0.7899,
2175
+ "step": 306
2176
+ },
2177
+ {
2178
+ "epoch": 0.5451720310765816,
2179
+ "grad_norm": 0.33827194571495056,
2180
+ "learning_rate": 9.969380236408656e-05,
2181
+ "loss": 0.8088,
2182
+ "step": 307
2183
+ },
2184
+ {
2185
+ "epoch": 0.5469478357380688,
2186
+ "grad_norm": 0.32632124423980713,
2187
+ "learning_rate": 9.908141857552737e-05,
2188
+ "loss": 0.769,
2189
+ "step": 308
2190
+ },
2191
+ {
2192
+ "epoch": 0.548723640399556,
2193
+ "grad_norm": 0.3152617812156677,
2194
+ "learning_rate": 9.846906923634079e-05,
2195
+ "loss": 0.7804,
2196
+ "step": 309
2197
+ },
2198
+ {
2199
+ "epoch": 0.5504994450610433,
2200
+ "grad_norm": 0.33337536454200745,
2201
+ "learning_rate": 9.78567773113397e-05,
2202
+ "loss": 0.7379,
2203
+ "step": 310
2204
+ },
2205
+ {
2206
+ "epoch": 0.5522752497225305,
2207
+ "grad_norm": 0.3020349144935608,
2208
+ "learning_rate": 9.724456576318381e-05,
2209
+ "loss": 0.7146,
2210
+ "step": 311
2211
+ },
2212
+ {
2213
+ "epoch": 0.5540510543840178,
2214
+ "grad_norm": 0.34656378626823425,
2215
+ "learning_rate": 9.663245755151846e-05,
2216
+ "loss": 0.7437,
2217
+ "step": 312
2218
+ },
2219
+ {
2220
+ "epoch": 0.555826859045505,
2221
+ "grad_norm": 0.3417186737060547,
2222
+ "learning_rate": 9.602047563211359e-05,
2223
+ "loss": 0.7472,
2224
+ "step": 313
2225
+ },
2226
+ {
2227
+ "epoch": 0.5576026637069922,
2228
+ "grad_norm": 0.34442222118377686,
2229
+ "learning_rate": 9.540864295600283e-05,
2230
+ "loss": 0.7426,
2231
+ "step": 314
2232
+ },
2233
+ {
2234
+ "epoch": 0.5593784683684795,
2235
+ "grad_norm": 0.3521478772163391,
2236
+ "learning_rate": 9.479698246862276e-05,
2237
+ "loss": 0.7522,
2238
+ "step": 315
2239
+ },
2240
+ {
2241
+ "epoch": 0.5611542730299667,
2242
+ "grad_norm": 0.3358227014541626,
2243
+ "learning_rate": 9.418551710895243e-05,
2244
+ "loss": 0.7454,
2245
+ "step": 316
2246
+ },
2247
+ {
2248
+ "epoch": 0.562930077691454,
2249
+ "grad_norm": 0.343226820230484,
2250
+ "learning_rate": 9.357426980865301e-05,
2251
+ "loss": 0.7341,
2252
+ "step": 317
2253
+ },
2254
+ {
2255
+ "epoch": 0.5647058823529412,
2256
+ "grad_norm": 0.3432699739933014,
2257
+ "learning_rate": 9.296326349120785e-05,
2258
+ "loss": 0.6836,
2259
+ "step": 318
2260
+ },
2261
+ {
2262
+ "epoch": 0.5664816870144284,
2263
+ "grad_norm": 0.3710852265357971,
2264
+ "learning_rate": 9.235252107106279e-05,
2265
+ "loss": 0.6961,
2266
+ "step": 319
2267
+ },
2268
+ {
2269
+ "epoch": 0.5682574916759157,
2270
+ "grad_norm": 0.351094514131546,
2271
+ "learning_rate": 9.174206545276677e-05,
2272
+ "loss": 0.6668,
2273
+ "step": 320
2274
+ },
2275
+ {
2276
+ "epoch": 0.5700332963374029,
2277
+ "grad_norm": 0.4484163224697113,
2278
+ "learning_rate": 9.113191953011287e-05,
2279
+ "loss": 0.7427,
2280
+ "step": 321
2281
+ },
2282
+ {
2283
+ "epoch": 0.5718091009988902,
2284
+ "grad_norm": 0.44636109471321106,
2285
+ "learning_rate": 9.052210618527966e-05,
2286
+ "loss": 0.8119,
2287
+ "step": 322
2288
+ },
2289
+ {
2290
+ "epoch": 0.5735849056603773,
2291
+ "grad_norm": 0.43749314546585083,
2292
+ "learning_rate": 8.991264828797319e-05,
2293
+ "loss": 0.7846,
2294
+ "step": 323
2295
+ },
2296
+ {
2297
+ "epoch": 0.5753607103218646,
2298
+ "grad_norm": 0.4471510350704193,
2299
+ "learning_rate": 8.930356869456919e-05,
2300
+ "loss": 0.7215,
2301
+ "step": 324
2302
+ },
2303
+ {
2304
+ "epoch": 0.5771365149833518,
2305
+ "grad_norm": 0.5141078233718872,
2306
+ "learning_rate": 8.869489024725595e-05,
2307
+ "loss": 0.7492,
2308
+ "step": 325
2309
+ },
2310
+ {
2311
+ "epoch": 0.578912319644839,
2312
+ "grad_norm": 0.2640296518802643,
2313
+ "learning_rate": 8.808663577317764e-05,
2314
+ "loss": 0.8625,
2315
+ "step": 326
2316
+ },
2317
+ {
2318
+ "epoch": 0.5806881243063263,
2319
+ "grad_norm": 0.28867048025131226,
2320
+ "learning_rate": 8.747882808357828e-05,
2321
+ "loss": 0.8352,
2322
+ "step": 327
2323
+ },
2324
+ {
2325
+ "epoch": 0.5824639289678135,
2326
+ "grad_norm": 0.2925030589103699,
2327
+ "learning_rate": 8.687148997294621e-05,
2328
+ "loss": 0.8091,
2329
+ "step": 328
2330
+ },
2331
+ {
2332
+ "epoch": 0.5842397336293008,
2333
+ "grad_norm": 0.28383681178092957,
2334
+ "learning_rate": 8.626464421815919e-05,
2335
+ "loss": 0.784,
2336
+ "step": 329
2337
+ },
2338
+ {
2339
+ "epoch": 0.586015538290788,
2340
+ "grad_norm": 0.3055633306503296,
2341
+ "learning_rate": 8.565831357763039e-05,
2342
+ "loss": 0.79,
2343
+ "step": 330
2344
+ },
2345
+ {
2346
+ "epoch": 0.5877913429522752,
2347
+ "grad_norm": 0.30299943685531616,
2348
+ "learning_rate": 8.505252079045458e-05,
2349
+ "loss": 0.8105,
2350
+ "step": 331
2351
+ },
2352
+ {
2353
+ "epoch": 0.5895671476137625,
2354
+ "grad_norm": 0.3154890239238739,
2355
+ "learning_rate": 8.444728857555572e-05,
2356
+ "loss": 0.7664,
2357
+ "step": 332
2358
+ },
2359
+ {
2360
+ "epoch": 0.5913429522752497,
2361
+ "grad_norm": 0.31844133138656616,
2362
+ "learning_rate": 8.384263963083453e-05,
2363
+ "loss": 0.7709,
2364
+ "step": 333
2365
+ },
2366
+ {
2367
+ "epoch": 0.593118756936737,
2368
+ "grad_norm": 0.31844353675842285,
2369
+ "learning_rate": 8.323859663231768e-05,
2370
+ "loss": 0.7426,
2371
+ "step": 334
2372
+ },
2373
+ {
2374
+ "epoch": 0.5948945615982242,
2375
+ "grad_norm": 0.31527841091156006,
2376
+ "learning_rate": 8.263518223330697e-05,
2377
+ "loss": 0.7441,
2378
+ "step": 335
2379
+ },
2380
+ {
2381
+ "epoch": 0.5966703662597115,
2382
+ "grad_norm": 0.32145699858665466,
2383
+ "learning_rate": 8.203241906353014e-05,
2384
+ "loss": 0.7333,
2385
+ "step": 336
2386
+ },
2387
+ {
2388
+ "epoch": 0.5984461709211987,
2389
+ "grad_norm": 0.3175109922885895,
2390
+ "learning_rate": 8.143032972829183e-05,
2391
+ "loss": 0.7488,
2392
+ "step": 337
2393
+ },
2394
+ {
2395
+ "epoch": 0.6002219755826859,
2396
+ "grad_norm": 0.3342651128768921,
2397
+ "learning_rate": 8.082893680762619e-05,
2398
+ "loss": 0.7265,
2399
+ "step": 338
2400
+ },
2401
+ {
2402
+ "epoch": 0.6019977802441732,
2403
+ "grad_norm": 0.339743971824646,
2404
+ "learning_rate": 8.022826285544968e-05,
2405
+ "loss": 0.7005,
2406
+ "step": 339
2407
+ },
2408
+ {
2409
+ "epoch": 0.6037735849056604,
2410
+ "grad_norm": 0.35757359862327576,
2411
+ "learning_rate": 7.96283303987156e-05,
2412
+ "loss": 0.7806,
2413
+ "step": 340
2414
+ },
2415
+ {
2416
+ "epoch": 0.6055493895671477,
2417
+ "grad_norm": 0.4024328291416168,
2418
+ "learning_rate": 7.902916193656898e-05,
2419
+ "loss": 0.6895,
2420
+ "step": 341
2421
+ },
2422
+ {
2423
+ "epoch": 0.6073251942286348,
2424
+ "grad_norm": 0.3628247380256653,
2425
+ "learning_rate": 7.843077993950302e-05,
2426
+ "loss": 0.7285,
2427
+ "step": 342
2428
+ },
2429
+ {
2430
+ "epoch": 0.609100998890122,
2431
+ "grad_norm": 0.3793889582157135,
2432
+ "learning_rate": 7.783320684851614e-05,
2433
+ "loss": 0.729,
2434
+ "step": 343
2435
+ },
2436
+ {
2437
+ "epoch": 0.6108768035516093,
2438
+ "grad_norm": 0.37614578008651733,
2439
+ "learning_rate": 7.72364650742707e-05,
2440
+ "loss": 0.6869,
2441
+ "step": 344
2442
+ },
2443
+ {
2444
+ "epoch": 0.6126526082130965,
2445
+ "grad_norm": 0.3737132251262665,
2446
+ "learning_rate": 7.664057699625214e-05,
2447
+ "loss": 0.7373,
2448
+ "step": 345
2449
+ },
2450
+ {
2451
+ "epoch": 0.6144284128745838,
2452
+ "grad_norm": 0.40523961186408997,
2453
+ "learning_rate": 7.604556496193015e-05,
2454
+ "loss": 0.729,
2455
+ "step": 346
2456
+ },
2457
+ {
2458
+ "epoch": 0.616204217536071,
2459
+ "grad_norm": 0.3903469145298004,
2460
+ "learning_rate": 7.54514512859201e-05,
2461
+ "loss": 0.7063,
2462
+ "step": 347
2463
+ },
2464
+ {
2465
+ "epoch": 0.6179800221975583,
2466
+ "grad_norm": 0.43782973289489746,
2467
+ "learning_rate": 7.485825824914659e-05,
2468
+ "loss": 0.6763,
2469
+ "step": 348
2470
+ },
2471
+ {
2472
+ "epoch": 0.6197558268590455,
2473
+ "grad_norm": 0.4907206594944,
2474
+ "learning_rate": 7.426600809800752e-05,
2475
+ "loss": 0.7405,
2476
+ "step": 349
2477
+ },
2478
+ {
2479
+ "epoch": 0.6215316315205327,
2480
+ "grad_norm": 0.5378274917602539,
2481
+ "learning_rate": 7.36747230435401e-05,
2482
+ "loss": 0.7417,
2483
+ "step": 350
2484
+ },
2485
+ {
2486
+ "epoch": 0.62330743618202,
2487
+ "grad_norm": 0.266481876373291,
2488
+ "learning_rate": 7.308442526058756e-05,
2489
+ "loss": 0.8434,
2490
+ "step": 351
2491
+ },
2492
+ {
2493
+ "epoch": 0.6250832408435072,
2494
+ "grad_norm": 0.28670433163642883,
2495
+ "learning_rate": 7.249513688696786e-05,
2496
+ "loss": 0.8049,
2497
+ "step": 352
2498
+ },
2499
+ {
2500
+ "epoch": 0.6268590455049945,
2501
+ "grad_norm": 0.29961690306663513,
2502
+ "learning_rate": 7.190688002264308e-05,
2503
+ "loss": 0.762,
2504
+ "step": 353
2505
+ },
2506
+ {
2507
+ "epoch": 0.6286348501664817,
2508
+ "grad_norm": 0.2873949706554413,
2509
+ "learning_rate": 7.131967672889101e-05,
2510
+ "loss": 0.7389,
2511
+ "step": 354
2512
+ },
2513
+ {
2514
+ "epoch": 0.6304106548279689,
2515
+ "grad_norm": 0.3315136730670929,
2516
+ "learning_rate": 7.073354902747741e-05,
2517
+ "loss": 0.7719,
2518
+ "step": 355
2519
+ },
2520
+ {
2521
+ "epoch": 0.6321864594894562,
2522
+ "grad_norm": 0.31057095527648926,
2523
+ "learning_rate": 7.014851889983057e-05,
2524
+ "loss": 0.7407,
2525
+ "step": 356
2526
+ },
2527
+ {
2528
+ "epoch": 0.6339622641509434,
2529
+ "grad_norm": 0.345838725566864,
2530
+ "learning_rate": 6.95646082862164e-05,
2531
+ "loss": 0.7838,
2532
+ "step": 357
2533
+ },
2534
+ {
2535
+ "epoch": 0.6357380688124307,
2536
+ "grad_norm": 0.31915196776390076,
2537
+ "learning_rate": 6.898183908491617e-05,
2538
+ "loss": 0.7591,
2539
+ "step": 358
2540
+ },
2541
+ {
2542
+ "epoch": 0.6375138734739179,
2543
+ "grad_norm": 0.3124110698699951,
2544
+ "learning_rate": 6.840023315140475e-05,
2545
+ "loss": 0.7222,
2546
+ "step": 359
2547
+ },
2548
+ {
2549
+ "epoch": 0.6392896781354052,
2550
+ "grad_norm": 0.3307512104511261,
2551
+ "learning_rate": 6.781981229753145e-05,
2552
+ "loss": 0.7472,
2553
+ "step": 360
2554
+ },
2555
+ {
2556
+ "epoch": 0.6410654827968923,
2557
+ "grad_norm": 0.3425205945968628,
2558
+ "learning_rate": 6.724059829070158e-05,
2559
+ "loss": 0.764,
2560
+ "step": 361
2561
+ },
2562
+ {
2563
+ "epoch": 0.6428412874583795,
2564
+ "grad_norm": 0.33861225843429565,
2565
+ "learning_rate": 6.666261285306047e-05,
2566
+ "loss": 0.7396,
2567
+ "step": 362
2568
+ },
2569
+ {
2570
+ "epoch": 0.6446170921198668,
2571
+ "grad_norm": 0.3248923420906067,
2572
+ "learning_rate": 6.608587766067852e-05,
2573
+ "loss": 0.7158,
2574
+ "step": 363
2575
+ },
2576
+ {
2577
+ "epoch": 0.646392896781354,
2578
+ "grad_norm": 0.349185049533844,
2579
+ "learning_rate": 6.551041434273861e-05,
2580
+ "loss": 0.7415,
2581
+ "step": 364
2582
+ },
2583
+ {
2584
+ "epoch": 0.6481687014428413,
2585
+ "grad_norm": 0.33934569358825684,
2586
+ "learning_rate": 6.493624448072457e-05,
2587
+ "loss": 0.744,
2588
+ "step": 365
2589
+ },
2590
+ {
2591
+ "epoch": 0.6499445061043285,
2592
+ "grad_norm": 0.3628052771091461,
2593
+ "learning_rate": 6.43633896076122e-05,
2594
+ "loss": 0.7328,
2595
+ "step": 366
2596
+ },
2597
+ {
2598
+ "epoch": 0.6517203107658157,
2599
+ "grad_norm": 0.348979115486145,
2600
+ "learning_rate": 6.379187120706138e-05,
2601
+ "loss": 0.6755,
2602
+ "step": 367
2603
+ },
2604
+ {
2605
+ "epoch": 0.653496115427303,
2606
+ "grad_norm": 0.38474076986312866,
2607
+ "learning_rate": 6.322171071261071e-05,
2608
+ "loss": 0.711,
2609
+ "step": 368
2610
+ },
2611
+ {
2612
+ "epoch": 0.6552719200887902,
2613
+ "grad_norm": 0.34556257724761963,
2614
+ "learning_rate": 6.26529295068733e-05,
2615
+ "loss": 0.6995,
2616
+ "step": 369
2617
+ },
2618
+ {
2619
+ "epoch": 0.6570477247502775,
2620
+ "grad_norm": 0.4337230622768402,
2621
+ "learning_rate": 6.208554892073528e-05,
2622
+ "loss": 0.7412,
2623
+ "step": 370
2624
+ },
2625
+ {
2626
+ "epoch": 0.6588235294117647,
2627
+ "grad_norm": 0.37804853916168213,
2628
+ "learning_rate": 6.151959023255545e-05,
2629
+ "loss": 0.6724,
2630
+ "step": 371
2631
+ },
2632
+ {
2633
+ "epoch": 0.6605993340732519,
2634
+ "grad_norm": 0.40870919823646545,
2635
+ "learning_rate": 6.095507466736763e-05,
2636
+ "loss": 0.7243,
2637
+ "step": 372
2638
+ },
2639
+ {
2640
+ "epoch": 0.6623751387347392,
2641
+ "grad_norm": 0.45504140853881836,
2642
+ "learning_rate": 6.039202339608432e-05,
2643
+ "loss": 0.7373,
2644
+ "step": 373
2645
+ },
2646
+ {
2647
+ "epoch": 0.6641509433962264,
2648
+ "grad_norm": 0.46973538398742676,
2649
+ "learning_rate": 5.983045753470308e-05,
2650
+ "loss": 0.7101,
2651
+ "step": 374
2652
+ },
2653
+ {
2654
+ "epoch": 0.6659267480577137,
2655
+ "grad_norm": 0.5572993755340576,
2656
+ "learning_rate": 5.927039814351426e-05,
2657
+ "loss": 0.7393,
2658
+ "step": 375
2659
+ },
2660
+ {
2661
+ "epoch": 0.6677025527192009,
2662
+ "grad_norm": 0.2691468596458435,
2663
+ "learning_rate": 5.8711866226311553e-05,
2664
+ "loss": 0.8102,
2665
+ "step": 376
2666
+ },
2667
+ {
2668
+ "epoch": 0.6694783573806882,
2669
+ "grad_norm": 0.2898322641849518,
2670
+ "learning_rate": 5.8154882729603876e-05,
2671
+ "loss": 0.7968,
2672
+ "step": 377
2673
+ },
2674
+ {
2675
+ "epoch": 0.6712541620421754,
2676
+ "grad_norm": 0.3048444092273712,
2677
+ "learning_rate": 5.7599468541830356e-05,
2678
+ "loss": 0.775,
2679
+ "step": 378
2680
+ },
2681
+ {
2682
+ "epoch": 0.6730299667036626,
2683
+ "grad_norm": 0.3111611604690552,
2684
+ "learning_rate": 5.7045644492576346e-05,
2685
+ "loss": 0.7742,
2686
+ "step": 379
2687
+ },
2688
+ {
2689
+ "epoch": 0.6748057713651499,
2690
+ "grad_norm": 0.31889772415161133,
2691
+ "learning_rate": 5.64934313517927e-05,
2692
+ "loss": 0.7304,
2693
+ "step": 380
2694
+ },
2695
+ {
2696
+ "epoch": 0.676581576026637,
2697
+ "grad_norm": 0.3219664692878723,
2698
+ "learning_rate": 5.5942849829016695e-05,
2699
+ "loss": 0.7679,
2700
+ "step": 381
2701
+ },
2702
+ {
2703
+ "epoch": 0.6783573806881243,
2704
+ "grad_norm": 0.30955034494400024,
2705
+ "learning_rate": 5.5393920572595356e-05,
2706
+ "loss": 0.7443,
2707
+ "step": 382
2708
+ },
2709
+ {
2710
+ "epoch": 0.6801331853496115,
2711
+ "grad_norm": 0.344043105840683,
2712
+ "learning_rate": 5.484666416891109e-05,
2713
+ "loss": 0.7272,
2714
+ "step": 383
2715
+ },
2716
+ {
2717
+ "epoch": 0.6819089900110987,
2718
+ "grad_norm": 0.33895599842071533,
2719
+ "learning_rate": 5.430110114160964e-05,
2720
+ "loss": 0.7585,
2721
+ "step": 384
2722
+ },
2723
+ {
2724
+ "epoch": 0.683684794672586,
2725
+ "grad_norm": 0.37816834449768066,
2726
+ "learning_rate": 5.375725195083046e-05,
2727
+ "loss": 0.7749,
2728
+ "step": 385
2729
+ },
2730
+ {
2731
+ "epoch": 0.6854605993340732,
2732
+ "grad_norm": 0.3477395176887512,
2733
+ "learning_rate": 5.321513699243924e-05,
2734
+ "loss": 0.7022,
2735
+ "step": 386
2736
+ },
2737
+ {
2738
+ "epoch": 0.6872364039955605,
2739
+ "grad_norm": 0.3380398154258728,
2740
+ "learning_rate": 5.2674776597263186e-05,
2741
+ "loss": 0.7266,
2742
+ "step": 387
2743
+ },
2744
+ {
2745
+ "epoch": 0.6890122086570477,
2746
+ "grad_norm": 0.35505762696266174,
2747
+ "learning_rate": 5.2136191030328455e-05,
2748
+ "loss": 0.7411,
2749
+ "step": 388
2750
+ },
2751
+ {
2752
+ "epoch": 0.690788013318535,
2753
+ "grad_norm": 0.38739171624183655,
2754
+ "learning_rate": 5.159940049010015e-05,
2755
+ "loss": 0.7666,
2756
+ "step": 389
2757
+ },
2758
+ {
2759
+ "epoch": 0.6925638179800222,
2760
+ "grad_norm": 0.38473132252693176,
2761
+ "learning_rate": 5.106442510772489e-05,
2762
+ "loss": 0.7038,
2763
+ "step": 390
2764
+ },
2765
+ {
2766
+ "epoch": 0.6943396226415094,
2767
+ "grad_norm": 0.37635302543640137,
2768
+ "learning_rate": 5.0531284946275784e-05,
2769
+ "loss": 0.7488,
2770
+ "step": 391
2771
+ },
2772
+ {
2773
+ "epoch": 0.6961154273029967,
2774
+ "grad_norm": 0.37422046065330505,
2775
+ "learning_rate": 5.000000000000002e-05,
2776
+ "loss": 0.693,
2777
+ "step": 392
2778
+ },
2779
+ {
2780
+ "epoch": 0.6978912319644839,
2781
+ "grad_norm": 0.3987278342247009,
2782
+ "learning_rate": 4.9470590193569044e-05,
2783
+ "loss": 0.6965,
2784
+ "step": 393
2785
+ },
2786
+ {
2787
+ "epoch": 0.6996670366259712,
2788
+ "grad_norm": 0.34372609853744507,
2789
+ "learning_rate": 4.894307538133129e-05,
2790
+ "loss": 0.6632,
2791
+ "step": 394
2792
+ },
2793
+ {
2794
+ "epoch": 0.7014428412874584,
2795
+ "grad_norm": 0.4215118885040283,
2796
+ "learning_rate": 4.841747534656763e-05,
2797
+ "loss": 0.7081,
2798
+ "step": 395
2799
+ },
2800
+ {
2801
+ "epoch": 0.7032186459489456,
2802
+ "grad_norm": 0.4211183488368988,
2803
+ "learning_rate": 4.7893809800749403e-05,
2804
+ "loss": 0.687,
2805
+ "step": 396
2806
+ },
2807
+ {
2808
+ "epoch": 0.7049944506104329,
2809
+ "grad_norm": 0.44248080253601074,
2810
+ "learning_rate": 4.737209838279922e-05,
2811
+ "loss": 0.7118,
2812
+ "step": 397
2813
+ },
2814
+ {
2815
+ "epoch": 0.7067702552719201,
2816
+ "grad_norm": 0.38100606203079224,
2817
+ "learning_rate": 4.685236065835443e-05,
2818
+ "loss": 0.6259,
2819
+ "step": 398
2820
+ },
2821
+ {
2822
+ "epoch": 0.7085460599334074,
2823
+ "grad_norm": 0.46482354402542114,
2824
+ "learning_rate": 4.6334616119033356e-05,
2825
+ "loss": 0.6668,
2826
+ "step": 399
2827
+ },
2828
+ {
2829
+ "epoch": 0.7103218645948945,
2830
+ "grad_norm": 0.5484885573387146,
2831
+ "learning_rate": 4.5818884181704294e-05,
2832
+ "loss": 0.7973,
2833
+ "step": 400
2834
+ },
2835
+ {
2836
+ "epoch": 0.7120976692563818,
2837
+ "grad_norm": 0.2660059928894043,
2838
+ "learning_rate": 4.530518418775733e-05,
2839
+ "loss": 0.7845,
2840
+ "step": 401
2841
+ },
2842
+ {
2843
+ "epoch": 0.713873473917869,
2844
+ "grad_norm": 0.30005505681037903,
2845
+ "learning_rate": 4.479353540237903e-05,
2846
+ "loss": 0.8141,
2847
+ "step": 402
2848
+ },
2849
+ {
2850
+ "epoch": 0.7156492785793562,
2851
+ "grad_norm": 0.3031437397003174,
2852
+ "learning_rate": 4.4283957013829846e-05,
2853
+ "loss": 0.7505,
2854
+ "step": 403
2855
+ },
2856
+ {
2857
+ "epoch": 0.7174250832408435,
2858
+ "grad_norm": 0.3152884542942047,
2859
+ "learning_rate": 4.3776468132724604e-05,
2860
+ "loss": 0.8191,
2861
+ "step": 404
2862
+ },
2863
+ {
2864
+ "epoch": 0.7192008879023307,
2865
+ "grad_norm": 0.3122805058956146,
2866
+ "learning_rate": 4.3271087791315734e-05,
2867
+ "loss": 0.7732,
2868
+ "step": 405
2869
+ },
2870
+ {
2871
+ "epoch": 0.720976692563818,
2872
+ "grad_norm": 0.3241139054298401,
2873
+ "learning_rate": 4.276783494277954e-05,
2874
+ "loss": 0.7652,
2875
+ "step": 406
2876
+ },
2877
+ {
2878
+ "epoch": 0.7227524972253052,
2879
+ "grad_norm": 0.3523857295513153,
2880
+ "learning_rate": 4.2266728460505375e-05,
2881
+ "loss": 0.7923,
2882
+ "step": 407
2883
+ },
2884
+ {
2885
+ "epoch": 0.7245283018867924,
2886
+ "grad_norm": 0.3518478274345398,
2887
+ "learning_rate": 4.176778713738787e-05,
2888
+ "loss": 0.8046,
2889
+ "step": 408
2890
+ },
2891
+ {
2892
+ "epoch": 0.7263041065482797,
2893
+ "grad_norm": 0.35740435123443604,
2894
+ "learning_rate": 4.127102968512214e-05,
2895
+ "loss": 0.741,
2896
+ "step": 409
2897
+ },
2898
+ {
2899
+ "epoch": 0.7280799112097669,
2900
+ "grad_norm": 0.3561273217201233,
2901
+ "learning_rate": 4.077647473350201e-05,
2902
+ "loss": 0.7304,
2903
+ "step": 410
2904
+ },
2905
+ {
2906
+ "epoch": 0.7298557158712542,
2907
+ "grad_norm": 0.3595544397830963,
2908
+ "learning_rate": 4.028414082972141e-05,
2909
+ "loss": 0.7601,
2910
+ "step": 411
2911
+ },
2912
+ {
2913
+ "epoch": 0.7316315205327414,
2914
+ "grad_norm": 0.38603028655052185,
2915
+ "learning_rate": 3.97940464376787e-05,
2916
+ "loss": 0.768,
2917
+ "step": 412
2918
+ },
2919
+ {
2920
+ "epoch": 0.7334073251942287,
2921
+ "grad_norm": 0.347781240940094,
2922
+ "learning_rate": 3.9306209937284346e-05,
2923
+ "loss": 0.7255,
2924
+ "step": 413
2925
+ },
2926
+ {
2927
+ "epoch": 0.7351831298557159,
2928
+ "grad_norm": 0.3760242462158203,
2929
+ "learning_rate": 3.882064962377154e-05,
2930
+ "loss": 0.7371,
2931
+ "step": 414
2932
+ },
2933
+ {
2934
+ "epoch": 0.7369589345172031,
2935
+ "grad_norm": 0.359371542930603,
2936
+ "learning_rate": 3.83373837070101e-05,
2937
+ "loss": 0.7422,
2938
+ "step": 415
2939
+ },
2940
+ {
2941
+ "epoch": 0.7387347391786904,
2942
+ "grad_norm": 0.3574449419975281,
2943
+ "learning_rate": 3.7856430310823545e-05,
2944
+ "loss": 0.6915,
2945
+ "step": 416
2946
+ },
2947
+ {
2948
+ "epoch": 0.7405105438401776,
2949
+ "grad_norm": 0.3730245530605316,
2950
+ "learning_rate": 3.737780747230941e-05,
2951
+ "loss": 0.7309,
2952
+ "step": 417
2953
+ },
2954
+ {
2955
+ "epoch": 0.7422863485016649,
2956
+ "grad_norm": 0.36496400833129883,
2957
+ "learning_rate": 3.69015331411628e-05,
2958
+ "loss": 0.7245,
2959
+ "step": 418
2960
+ },
2961
+ {
2962
+ "epoch": 0.744062153163152,
2963
+ "grad_norm": 0.3593985140323639,
2964
+ "learning_rate": 3.642762517900322e-05,
2965
+ "loss": 0.6389,
2966
+ "step": 419
2967
+ },
2968
+ {
2969
+ "epoch": 0.7458379578246392,
2970
+ "grad_norm": 0.3603939116001129,
2971
+ "learning_rate": 3.595610135870472e-05,
2972
+ "loss": 0.703,
2973
+ "step": 420
2974
+ },
2975
+ {
2976
+ "epoch": 0.7476137624861265,
2977
+ "grad_norm": 0.397124320268631,
2978
+ "learning_rate": 3.548697936372937e-05,
2979
+ "loss": 0.7265,
2980
+ "step": 421
2981
+ },
2982
+ {
2983
+ "epoch": 0.7493895671476137,
2984
+ "grad_norm": 0.4071907103061676,
2985
+ "learning_rate": 3.5020276787464056e-05,
2986
+ "loss": 0.6752,
2987
+ "step": 422
2988
+ },
2989
+ {
2990
+ "epoch": 0.751165371809101,
2991
+ "grad_norm": 0.3834024965763092,
2992
+ "learning_rate": 3.455601113256073e-05,
2993
+ "loss": 0.6297,
2994
+ "step": 423
2995
+ },
2996
+ {
2997
+ "epoch": 0.751165371809101,
2998
+ "eval_loss": 0.7374839186668396,
2999
+ "eval_runtime": 156.6123,
3000
+ "eval_samples_per_second": 6.06,
3001
+ "eval_steps_per_second": 1.52,
3002
+ "step": 423
3003
  }
3004
  ],
3005
  "logging_steps": 1,
 
3019
  "attributes": {}
3020
  }
3021
  },
3022
+ "total_flos": 5.571234948741857e+17,
3023
  "train_batch_size": 2,
3024
  "trial_name": null,
3025
  "trial_params": null