akahana commited on
Commit
257a7e4
1 Parent(s): 3953f31

End of training

Browse files
README.md CHANGED
@@ -1,9 +1,24 @@
1
  ---
2
  tags:
3
  - generated_from_trainer
 
 
 
 
4
  model-index:
5
  - name: tinygpt2-javanese
6
- results: []
 
 
 
 
 
 
 
 
 
 
 
7
  ---
8
 
9
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -11,7 +26,10 @@ should probably proofread and complete it, then remove this comment. -->
11
 
12
  # tinygpt2-javanese
13
 
14
- This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
 
 
 
15
 
16
  ## Model description
17
 
 
1
  ---
2
  tags:
3
  - generated_from_trainer
4
+ datasets:
5
+ - akahana/GlotCC-V1-jav-Latn
6
+ metrics:
7
+ - accuracy
8
  model-index:
9
  - name: tinygpt2-javanese
10
+ results:
11
+ - task:
12
+ name: Causal Language Modeling
13
+ type: text-generation
14
+ dataset:
15
+ name: akahana/GlotCC-V1-jav-Latn default
16
+ type: akahana/GlotCC-V1-jav-Latn
17
+ args: default
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.2907324408283162
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # tinygpt2-javanese
28
 
29
+ This model is a fine-tuned version of [](https://huggingface.co/) on the akahana/GlotCC-V1-jav-Latn default dataset.
30
+ It achieves the following results on the evaluation set:
31
+ - Loss: 4.6697
32
+ - Accuracy: 0.2907
33
 
34
  ## Model description
35
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 30.0,
3
- "eval_accuracy": 0.2786154321383402,
4
- "eval_loss": 4.764777183532715,
5
- "eval_runtime": 11.4146,
6
  "eval_samples": 4053,
7
- "eval_samples_per_second": 355.07,
8
- "eval_steps_per_second": 88.833,
9
- "perplexity": 117.30497689511513,
10
- "total_flos": 733383917568000.0,
11
- "train_loss": 1.6002090492649228,
12
- "train_runtime": 2253.0761,
13
  "train_samples": 80219,
14
- "train_samples_per_second": 1068.126,
15
- "train_steps_per_second": 66.762
16
  }
 
1
  {
2
+ "epoch": 40.0,
3
+ "eval_accuracy": 0.2907324408283162,
4
+ "eval_loss": 4.6696553230285645,
5
+ "eval_runtime": 11.8905,
6
  "eval_samples": 4053,
7
+ "eval_samples_per_second": 340.862,
8
+ "eval_steps_per_second": 85.279,
9
+ "perplexity": 106.66097251496613,
10
+ "total_flos": 977845223424000.0,
11
+ "train_loss": 1.1379991389100752,
12
+ "train_runtime": 2290.6772,
13
  "train_samples": 80219,
14
+ "train_samples_per_second": 1400.791,
15
+ "train_steps_per_second": 87.555
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 30.0,
3
- "eval_accuracy": 0.2786154321383402,
4
- "eval_loss": 4.764777183532715,
5
- "eval_runtime": 11.4146,
6
  "eval_samples": 4053,
7
- "eval_samples_per_second": 355.07,
8
- "eval_steps_per_second": 88.833,
9
- "perplexity": 117.30497689511513
10
  }
 
1
  {
2
+ "epoch": 40.0,
3
+ "eval_accuracy": 0.2907324408283162,
4
+ "eval_loss": 4.6696553230285645,
5
+ "eval_runtime": 11.8905,
6
  "eval_samples": 4053,
7
+ "eval_samples_per_second": 340.862,
8
+ "eval_steps_per_second": 85.279,
9
+ "perplexity": 106.66097251496613
10
  }
runs/Jul23_00-29-17_e063f4f30784/events.out.tfevents.1721696919.e063f4f30784.18221.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d74593e1c6ce518de7f2f295c25d67ebcd0f4dfde1a024430a44290cb68618a2
3
+ size 417
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 30.0,
3
- "total_flos": 733383917568000.0,
4
- "train_loss": 1.6002090492649228,
5
- "train_runtime": 2253.0761,
6
  "train_samples": 80219,
7
- "train_samples_per_second": 1068.126,
8
- "train_steps_per_second": 66.762
9
  }
 
1
  {
2
+ "epoch": 40.0,
3
+ "total_flos": 977845223424000.0,
4
+ "train_loss": 1.1379991389100752,
5
+ "train_runtime": 2290.6772,
6
  "train_samples": 80219,
7
+ "train_samples_per_second": 1400.791,
8
+ "train_steps_per_second": 87.555
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 30.0,
5
  "eval_steps": 500,
6
- "global_step": 150420,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2125,12 +2125,728 @@
2125
  "train_runtime": 2253.0761,
2126
  "train_samples_per_second": 1068.126,
2127
  "train_steps_per_second": 66.762
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2128
  }
2129
  ],
2130
  "logging_steps": 500,
2131
- "max_steps": 150420,
2132
  "num_input_tokens_seen": 0,
2133
- "num_train_epochs": 30,
2134
  "save_steps": 500,
2135
  "stateful_callbacks": {
2136
  "TrainerControl": {
@@ -2144,7 +2860,7 @@
2144
  "attributes": {}
2145
  }
2146
  },
2147
- "total_flos": 733383917568000.0,
2148
  "train_batch_size": 16,
2149
  "trial_name": null,
2150
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 40.0,
5
  "eval_steps": 500,
6
+ "global_step": 200560,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2125
  "train_runtime": 2253.0761,
2126
  "train_samples_per_second": 1068.126,
2127
  "train_steps_per_second": 66.762
2128
+ },
2129
+ {
2130
+ "epoch": 30.01595532508975,
2131
+ "grad_norm": 3.4099204540252686,
2132
+ "learning_rate": 4.998005584363782e-05,
2133
+ "loss": 4.671,
2134
+ "step": 150500
2135
+ },
2136
+ {
2137
+ "epoch": 30.11567610690068,
2138
+ "grad_norm": 3.130218029022217,
2139
+ "learning_rate": 4.985540486637415e-05,
2140
+ "loss": 4.6465,
2141
+ "step": 151000
2142
+ },
2143
+ {
2144
+ "epoch": 30.215396888711606,
2145
+ "grad_norm": 3.1219522953033447,
2146
+ "learning_rate": 4.9730753889110494e-05,
2147
+ "loss": 4.6809,
2148
+ "step": 151500
2149
+ },
2150
+ {
2151
+ "epoch": 30.315117670522536,
2152
+ "grad_norm": 3.1578962802886963,
2153
+ "learning_rate": 4.960610291184683e-05,
2154
+ "loss": 4.6582,
2155
+ "step": 152000
2156
+ },
2157
+ {
2158
+ "epoch": 30.414838452333466,
2159
+ "grad_norm": 3.1061723232269287,
2160
+ "learning_rate": 4.948145193458317e-05,
2161
+ "loss": 4.6614,
2162
+ "step": 152500
2163
+ },
2164
+ {
2165
+ "epoch": 30.514559234144397,
2166
+ "grad_norm": 3.1964001655578613,
2167
+ "learning_rate": 4.935680095731951e-05,
2168
+ "loss": 4.6936,
2169
+ "step": 153000
2170
+ },
2171
+ {
2172
+ "epoch": 30.614280015955327,
2173
+ "grad_norm": 3.108321189880371,
2174
+ "learning_rate": 4.9232149980055846e-05,
2175
+ "loss": 4.6881,
2176
+ "step": 153500
2177
+ },
2178
+ {
2179
+ "epoch": 30.714000797766253,
2180
+ "grad_norm": 3.1734015941619873,
2181
+ "learning_rate": 4.910749900279219e-05,
2182
+ "loss": 4.6532,
2183
+ "step": 154000
2184
+ },
2185
+ {
2186
+ "epoch": 30.813721579577184,
2187
+ "grad_norm": 3.293867349624634,
2188
+ "learning_rate": 4.898309732748305e-05,
2189
+ "loss": 4.6614,
2190
+ "step": 154500
2191
+ },
2192
+ {
2193
+ "epoch": 30.913442361388114,
2194
+ "grad_norm": 3.0667290687561035,
2195
+ "learning_rate": 4.885844635021939e-05,
2196
+ "loss": 4.6543,
2197
+ "step": 155000
2198
+ },
2199
+ {
2200
+ "epoch": 31.013163143199044,
2201
+ "grad_norm": 3.084986925125122,
2202
+ "learning_rate": 4.8733795372955726e-05,
2203
+ "loss": 4.6631,
2204
+ "step": 155500
2205
+ },
2206
+ {
2207
+ "epoch": 31.11288392500997,
2208
+ "grad_norm": 3.3350682258605957,
2209
+ "learning_rate": 4.860914439569206e-05,
2210
+ "loss": 4.6084,
2211
+ "step": 156000
2212
+ },
2213
+ {
2214
+ "epoch": 31.2126047068209,
2215
+ "grad_norm": 3.097383737564087,
2216
+ "learning_rate": 4.848474272038293e-05,
2217
+ "loss": 4.6217,
2218
+ "step": 156500
2219
+ },
2220
+ {
2221
+ "epoch": 31.31232548863183,
2222
+ "grad_norm": 3.11267352104187,
2223
+ "learning_rate": 4.836009174311927e-05,
2224
+ "loss": 4.6337,
2225
+ "step": 157000
2226
+ },
2227
+ {
2228
+ "epoch": 31.41204627044276,
2229
+ "grad_norm": 3.318540096282959,
2230
+ "learning_rate": 4.8235440765855605e-05,
2231
+ "loss": 4.637,
2232
+ "step": 157500
2233
+ },
2234
+ {
2235
+ "epoch": 31.51176705225369,
2236
+ "grad_norm": 3.259889841079712,
2237
+ "learning_rate": 4.811078978859194e-05,
2238
+ "loss": 4.6348,
2239
+ "step": 158000
2240
+ },
2241
+ {
2242
+ "epoch": 31.61148783406462,
2243
+ "grad_norm": 3.1023237705230713,
2244
+ "learning_rate": 4.798638811328281e-05,
2245
+ "loss": 4.6179,
2246
+ "step": 158500
2247
+ },
2248
+ {
2249
+ "epoch": 31.71120861587555,
2250
+ "grad_norm": 3.0736587047576904,
2251
+ "learning_rate": 4.786173713601915e-05,
2252
+ "loss": 4.6231,
2253
+ "step": 159000
2254
+ },
2255
+ {
2256
+ "epoch": 31.81092939768648,
2257
+ "grad_norm": 3.019336700439453,
2258
+ "learning_rate": 4.773708615875549e-05,
2259
+ "loss": 4.6274,
2260
+ "step": 159500
2261
+ },
2262
+ {
2263
+ "epoch": 31.910650179497406,
2264
+ "grad_norm": 3.163273334503174,
2265
+ "learning_rate": 4.7612435181491826e-05,
2266
+ "loss": 4.6486,
2267
+ "step": 160000
2268
+ },
2269
+ {
2270
+ "epoch": 32.01037096130834,
2271
+ "grad_norm": 2.973386287689209,
2272
+ "learning_rate": 4.748803350618269e-05,
2273
+ "loss": 4.6158,
2274
+ "step": 160500
2275
+ },
2276
+ {
2277
+ "epoch": 32.11009174311926,
2278
+ "grad_norm": 3.2876758575439453,
2279
+ "learning_rate": 4.736338252891903e-05,
2280
+ "loss": 4.5567,
2281
+ "step": 161000
2282
+ },
2283
+ {
2284
+ "epoch": 32.20981252493019,
2285
+ "grad_norm": 3.2019705772399902,
2286
+ "learning_rate": 4.723898085360989e-05,
2287
+ "loss": 4.5851,
2288
+ "step": 161500
2289
+ },
2290
+ {
2291
+ "epoch": 32.30953330674112,
2292
+ "grad_norm": 3.1916842460632324,
2293
+ "learning_rate": 4.711432987634623e-05,
2294
+ "loss": 4.5754,
2295
+ "step": 162000
2296
+ },
2297
+ {
2298
+ "epoch": 32.40925408855205,
2299
+ "grad_norm": 3.4018754959106445,
2300
+ "learning_rate": 4.6989678899082575e-05,
2301
+ "loss": 4.6011,
2302
+ "step": 162500
2303
+ },
2304
+ {
2305
+ "epoch": 32.508974870362984,
2306
+ "grad_norm": 3.220608949661255,
2307
+ "learning_rate": 4.686502792181891e-05,
2308
+ "loss": 4.5961,
2309
+ "step": 163000
2310
+ },
2311
+ {
2312
+ "epoch": 32.608695652173914,
2313
+ "grad_norm": 3.357311248779297,
2314
+ "learning_rate": 4.674037694455525e-05,
2315
+ "loss": 4.595,
2316
+ "step": 163500
2317
+ },
2318
+ {
2319
+ "epoch": 32.708416433984844,
2320
+ "grad_norm": 3.136885166168213,
2321
+ "learning_rate": 4.6615725967291585e-05,
2322
+ "loss": 4.6206,
2323
+ "step": 164000
2324
+ },
2325
+ {
2326
+ "epoch": 32.808137215795774,
2327
+ "grad_norm": 3.2210259437561035,
2328
+ "learning_rate": 4.649107499002792e-05,
2329
+ "loss": 4.6276,
2330
+ "step": 164500
2331
+ },
2332
+ {
2333
+ "epoch": 32.907857997606705,
2334
+ "grad_norm": 3.152240037918091,
2335
+ "learning_rate": 4.636642401276427e-05,
2336
+ "loss": 4.6193,
2337
+ "step": 165000
2338
+ },
2339
+ {
2340
+ "epoch": 33.00757877941763,
2341
+ "grad_norm": 3.079972505569458,
2342
+ "learning_rate": 4.624202233745512e-05,
2343
+ "loss": 4.605,
2344
+ "step": 165500
2345
+ },
2346
+ {
2347
+ "epoch": 33.10729956122856,
2348
+ "grad_norm": 3.3014163970947266,
2349
+ "learning_rate": 4.6117371360191464e-05,
2350
+ "loss": 4.5538,
2351
+ "step": 166000
2352
+ },
2353
+ {
2354
+ "epoch": 33.20702034303949,
2355
+ "grad_norm": 3.4153923988342285,
2356
+ "learning_rate": 4.5992720382927806e-05,
2357
+ "loss": 4.5645,
2358
+ "step": 166500
2359
+ },
2360
+ {
2361
+ "epoch": 33.30674112485042,
2362
+ "grad_norm": 3.6268794536590576,
2363
+ "learning_rate": 4.586806940566414e-05,
2364
+ "loss": 4.5478,
2365
+ "step": 167000
2366
+ },
2367
+ {
2368
+ "epoch": 33.40646190666135,
2369
+ "grad_norm": 3.0210297107696533,
2370
+ "learning_rate": 4.5743667730355e-05,
2371
+ "loss": 4.5944,
2372
+ "step": 167500
2373
+ },
2374
+ {
2375
+ "epoch": 33.50618268847228,
2376
+ "grad_norm": 3.265434503555298,
2377
+ "learning_rate": 4.561901675309135e-05,
2378
+ "loss": 4.5617,
2379
+ "step": 168000
2380
+ },
2381
+ {
2382
+ "epoch": 33.60590347028321,
2383
+ "grad_norm": 3.1370913982391357,
2384
+ "learning_rate": 4.5494365775827685e-05,
2385
+ "loss": 4.585,
2386
+ "step": 168500
2387
+ },
2388
+ {
2389
+ "epoch": 33.70562425209414,
2390
+ "grad_norm": 3.4594709873199463,
2391
+ "learning_rate": 4.536971479856402e-05,
2392
+ "loss": 4.5721,
2393
+ "step": 169000
2394
+ },
2395
+ {
2396
+ "epoch": 33.80534503390506,
2397
+ "grad_norm": 3.162144899368286,
2398
+ "learning_rate": 4.524531312325489e-05,
2399
+ "loss": 4.6202,
2400
+ "step": 169500
2401
+ },
2402
+ {
2403
+ "epoch": 33.90506581571599,
2404
+ "grad_norm": 3.464153528213501,
2405
+ "learning_rate": 4.512066214599123e-05,
2406
+ "loss": 4.5634,
2407
+ "step": 170000
2408
+ },
2409
+ {
2410
+ "epoch": 34.00478659752692,
2411
+ "grad_norm": 3.2576406002044678,
2412
+ "learning_rate": 4.4996011168727565e-05,
2413
+ "loss": 4.5797,
2414
+ "step": 170500
2415
+ },
2416
+ {
2417
+ "epoch": 34.10450737933785,
2418
+ "grad_norm": 3.270254373550415,
2419
+ "learning_rate": 4.48713601914639e-05,
2420
+ "loss": 4.541,
2421
+ "step": 171000
2422
+ },
2423
+ {
2424
+ "epoch": 34.204228161148784,
2425
+ "grad_norm": 3.1199378967285156,
2426
+ "learning_rate": 4.474695851615477e-05,
2427
+ "loss": 4.5263,
2428
+ "step": 171500
2429
+ },
2430
+ {
2431
+ "epoch": 34.303948942959714,
2432
+ "grad_norm": 3.253614664077759,
2433
+ "learning_rate": 4.46223075388911e-05,
2434
+ "loss": 4.5489,
2435
+ "step": 172000
2436
+ },
2437
+ {
2438
+ "epoch": 34.403669724770644,
2439
+ "grad_norm": 3.2037832736968994,
2440
+ "learning_rate": 4.4497656561627444e-05,
2441
+ "loss": 4.5551,
2442
+ "step": 172500
2443
+ },
2444
+ {
2445
+ "epoch": 34.503390506581574,
2446
+ "grad_norm": 3.099489450454712,
2447
+ "learning_rate": 4.4373005584363786e-05,
2448
+ "loss": 4.5527,
2449
+ "step": 173000
2450
+ },
2451
+ {
2452
+ "epoch": 34.6031112883925,
2453
+ "grad_norm": 3.2268526554107666,
2454
+ "learning_rate": 4.424860390905465e-05,
2455
+ "loss": 4.5489,
2456
+ "step": 173500
2457
+ },
2458
+ {
2459
+ "epoch": 34.70283207020343,
2460
+ "grad_norm": 3.2449424266815186,
2461
+ "learning_rate": 4.412395293179098e-05,
2462
+ "loss": 4.5515,
2463
+ "step": 174000
2464
+ },
2465
+ {
2466
+ "epoch": 34.80255285201436,
2467
+ "grad_norm": 3.177513837814331,
2468
+ "learning_rate": 4.399930195452733e-05,
2469
+ "loss": 4.551,
2470
+ "step": 174500
2471
+ },
2472
+ {
2473
+ "epoch": 34.90227363382529,
2474
+ "grad_norm": 3.1355063915252686,
2475
+ "learning_rate": 4.3874650977263665e-05,
2476
+ "loss": 4.5897,
2477
+ "step": 175000
2478
+ },
2479
+ {
2480
+ "epoch": 35.00199441563622,
2481
+ "grad_norm": 3.0109355449676514,
2482
+ "learning_rate": 4.375024930195453e-05,
2483
+ "loss": 4.5528,
2484
+ "step": 175500
2485
+ },
2486
+ {
2487
+ "epoch": 35.10171519744715,
2488
+ "grad_norm": 3.0925559997558594,
2489
+ "learning_rate": 4.362559832469087e-05,
2490
+ "loss": 4.4931,
2491
+ "step": 176000
2492
+ },
2493
+ {
2494
+ "epoch": 35.20143597925808,
2495
+ "grad_norm": 3.3511645793914795,
2496
+ "learning_rate": 4.350094734742721e-05,
2497
+ "loss": 4.5321,
2498
+ "step": 176500
2499
+ },
2500
+ {
2501
+ "epoch": 35.30115676106901,
2502
+ "grad_norm": 3.2780284881591797,
2503
+ "learning_rate": 4.3376296370163545e-05,
2504
+ "loss": 4.5354,
2505
+ "step": 177000
2506
+ },
2507
+ {
2508
+ "epoch": 35.40087754287994,
2509
+ "grad_norm": 3.305748224258423,
2510
+ "learning_rate": 4.3251894694854414e-05,
2511
+ "loss": 4.5359,
2512
+ "step": 177500
2513
+ },
2514
+ {
2515
+ "epoch": 35.50059832469086,
2516
+ "grad_norm": 3.3514251708984375,
2517
+ "learning_rate": 4.312724371759075e-05,
2518
+ "loss": 4.5107,
2519
+ "step": 178000
2520
+ },
2521
+ {
2522
+ "epoch": 35.60031910650179,
2523
+ "grad_norm": 3.582073211669922,
2524
+ "learning_rate": 4.300259274032708e-05,
2525
+ "loss": 4.5671,
2526
+ "step": 178500
2527
+ },
2528
+ {
2529
+ "epoch": 35.70003988831272,
2530
+ "grad_norm": 3.216836929321289,
2531
+ "learning_rate": 4.2877941763063424e-05,
2532
+ "loss": 4.555,
2533
+ "step": 179000
2534
+ },
2535
+ {
2536
+ "epoch": 35.79976067012365,
2537
+ "grad_norm": 3.314234495162964,
2538
+ "learning_rate": 4.275354008775429e-05,
2539
+ "loss": 4.5135,
2540
+ "step": 179500
2541
+ },
2542
+ {
2543
+ "epoch": 35.899481451934584,
2544
+ "grad_norm": 3.3673787117004395,
2545
+ "learning_rate": 4.262888911049063e-05,
2546
+ "loss": 4.5517,
2547
+ "step": 180000
2548
+ },
2549
+ {
2550
+ "epoch": 35.999202233745514,
2551
+ "grad_norm": 3.159799814224243,
2552
+ "learning_rate": 4.250423813322696e-05,
2553
+ "loss": 4.534,
2554
+ "step": 180500
2555
+ },
2556
+ {
2557
+ "epoch": 36.098923015556444,
2558
+ "grad_norm": 3.5018651485443115,
2559
+ "learning_rate": 4.237958715596331e-05,
2560
+ "loss": 4.5052,
2561
+ "step": 181000
2562
+ },
2563
+ {
2564
+ "epoch": 36.198643797367374,
2565
+ "grad_norm": 3.127002239227295,
2566
+ "learning_rate": 4.2255185480654166e-05,
2567
+ "loss": 4.4988,
2568
+ "step": 181500
2569
+ },
2570
+ {
2571
+ "epoch": 36.2983645791783,
2572
+ "grad_norm": 3.4011449813842773,
2573
+ "learning_rate": 4.213053450339051e-05,
2574
+ "loss": 4.4995,
2575
+ "step": 182000
2576
+ },
2577
+ {
2578
+ "epoch": 36.39808536098923,
2579
+ "grad_norm": 3.3019766807556152,
2580
+ "learning_rate": 4.200588352612685e-05,
2581
+ "loss": 4.4966,
2582
+ "step": 182500
2583
+ },
2584
+ {
2585
+ "epoch": 36.49780614280016,
2586
+ "grad_norm": 3.263709545135498,
2587
+ "learning_rate": 4.188123254886318e-05,
2588
+ "loss": 4.5205,
2589
+ "step": 183000
2590
+ },
2591
+ {
2592
+ "epoch": 36.59752692461109,
2593
+ "grad_norm": 3.4110920429229736,
2594
+ "learning_rate": 4.1756830873554045e-05,
2595
+ "loss": 4.5287,
2596
+ "step": 183500
2597
+ },
2598
+ {
2599
+ "epoch": 36.69724770642202,
2600
+ "grad_norm": 3.379786729812622,
2601
+ "learning_rate": 4.1632179896290393e-05,
2602
+ "loss": 4.5255,
2603
+ "step": 184000
2604
+ },
2605
+ {
2606
+ "epoch": 36.79696848823295,
2607
+ "grad_norm": 3.21069073677063,
2608
+ "learning_rate": 4.150752891902673e-05,
2609
+ "loss": 4.5143,
2610
+ "step": 184500
2611
+ },
2612
+ {
2613
+ "epoch": 36.89668927004388,
2614
+ "grad_norm": 3.2103688716888428,
2615
+ "learning_rate": 4.138287794176306e-05,
2616
+ "loss": 4.5236,
2617
+ "step": 185000
2618
+ },
2619
+ {
2620
+ "epoch": 36.99641005185481,
2621
+ "grad_norm": 3.1441774368286133,
2622
+ "learning_rate": 4.125847626645393e-05,
2623
+ "loss": 4.5418,
2624
+ "step": 185500
2625
+ },
2626
+ {
2627
+ "epoch": 37.09613083366573,
2628
+ "grad_norm": 3.385601043701172,
2629
+ "learning_rate": 4.113382528919027e-05,
2630
+ "loss": 4.4761,
2631
+ "step": 186000
2632
+ },
2633
+ {
2634
+ "epoch": 37.19585161547666,
2635
+ "grad_norm": 3.280444383621216,
2636
+ "learning_rate": 4.100917431192661e-05,
2637
+ "loss": 4.4938,
2638
+ "step": 186500
2639
+ },
2640
+ {
2641
+ "epoch": 37.29557239728759,
2642
+ "grad_norm": 3.104619026184082,
2643
+ "learning_rate": 4.088452333466294e-05,
2644
+ "loss": 4.4833,
2645
+ "step": 187000
2646
+ },
2647
+ {
2648
+ "epoch": 37.39529317909852,
2649
+ "grad_norm": 3.2329983711242676,
2650
+ "learning_rate": 4.076012165935381e-05,
2651
+ "loss": 4.494,
2652
+ "step": 187500
2653
+ },
2654
+ {
2655
+ "epoch": 37.49501396090945,
2656
+ "grad_norm": 3.4772567749023438,
2657
+ "learning_rate": 4.0635470682090146e-05,
2658
+ "loss": 4.5045,
2659
+ "step": 188000
2660
+ },
2661
+ {
2662
+ "epoch": 37.594734742720384,
2663
+ "grad_norm": 3.395953416824341,
2664
+ "learning_rate": 4.051081970482649e-05,
2665
+ "loss": 4.5083,
2666
+ "step": 188500
2667
+ },
2668
+ {
2669
+ "epoch": 37.694455524531314,
2670
+ "grad_norm": 3.3525466918945312,
2671
+ "learning_rate": 4.038616872756283e-05,
2672
+ "loss": 4.4781,
2673
+ "step": 189000
2674
+ },
2675
+ {
2676
+ "epoch": 37.794176306342244,
2677
+ "grad_norm": 3.308446168899536,
2678
+ "learning_rate": 4.026176705225369e-05,
2679
+ "loss": 4.5099,
2680
+ "step": 189500
2681
+ },
2682
+ {
2683
+ "epoch": 37.893897088153174,
2684
+ "grad_norm": 3.25447940826416,
2685
+ "learning_rate": 4.0137116074990025e-05,
2686
+ "loss": 4.5189,
2687
+ "step": 190000
2688
+ },
2689
+ {
2690
+ "epoch": 37.9936178699641,
2691
+ "grad_norm": 3.4862465858459473,
2692
+ "learning_rate": 4.001246509772637e-05,
2693
+ "loss": 4.5157,
2694
+ "step": 190500
2695
+ },
2696
+ {
2697
+ "epoch": 38.09333865177503,
2698
+ "grad_norm": 3.549028158187866,
2699
+ "learning_rate": 3.988781412046271e-05,
2700
+ "loss": 4.4673,
2701
+ "step": 191000
2702
+ },
2703
+ {
2704
+ "epoch": 38.19305943358596,
2705
+ "grad_norm": 3.362783670425415,
2706
+ "learning_rate": 3.976341244515357e-05,
2707
+ "loss": 4.4716,
2708
+ "step": 191500
2709
+ },
2710
+ {
2711
+ "epoch": 38.29278021539689,
2712
+ "grad_norm": 3.235966205596924,
2713
+ "learning_rate": 3.963876146788991e-05,
2714
+ "loss": 4.4923,
2715
+ "step": 192000
2716
+ },
2717
+ {
2718
+ "epoch": 38.39250099720782,
2719
+ "grad_norm": 3.627629518508911,
2720
+ "learning_rate": 3.951411049062625e-05,
2721
+ "loss": 4.5132,
2722
+ "step": 192500
2723
+ },
2724
+ {
2725
+ "epoch": 38.49222177901875,
2726
+ "grad_norm": 3.3382019996643066,
2727
+ "learning_rate": 3.938945951336259e-05,
2728
+ "loss": 4.4827,
2729
+ "step": 193000
2730
+ },
2731
+ {
2732
+ "epoch": 38.59194256082968,
2733
+ "grad_norm": 3.363459587097168,
2734
+ "learning_rate": 3.9265057838053456e-05,
2735
+ "loss": 4.4661,
2736
+ "step": 193500
2737
+ },
2738
+ {
2739
+ "epoch": 38.69166334264061,
2740
+ "grad_norm": 3.1365175247192383,
2741
+ "learning_rate": 3.914040686078979e-05,
2742
+ "loss": 4.5025,
2743
+ "step": 194000
2744
+ },
2745
+ {
2746
+ "epoch": 38.79138412445153,
2747
+ "grad_norm": 3.4474666118621826,
2748
+ "learning_rate": 3.9015755883526125e-05,
2749
+ "loss": 4.4787,
2750
+ "step": 194500
2751
+ },
2752
+ {
2753
+ "epoch": 38.89110490626246,
2754
+ "grad_norm": 3.3024516105651855,
2755
+ "learning_rate": 3.889110490626247e-05,
2756
+ "loss": 4.4861,
2757
+ "step": 195000
2758
+ },
2759
+ {
2760
+ "epoch": 38.99082568807339,
2761
+ "grad_norm": 3.238717555999756,
2762
+ "learning_rate": 3.8766703230953336e-05,
2763
+ "loss": 4.4912,
2764
+ "step": 195500
2765
+ },
2766
+ {
2767
+ "epoch": 39.09054646988432,
2768
+ "grad_norm": 3.4745333194732666,
2769
+ "learning_rate": 3.864205225368967e-05,
2770
+ "loss": 4.4664,
2771
+ "step": 196000
2772
+ },
2773
+ {
2774
+ "epoch": 39.19026725169525,
2775
+ "grad_norm": 3.440810441970825,
2776
+ "learning_rate": 3.8517401276426005e-05,
2777
+ "loss": 4.4892,
2778
+ "step": 196500
2779
+ },
2780
+ {
2781
+ "epoch": 39.289988033506184,
2782
+ "grad_norm": 3.4101953506469727,
2783
+ "learning_rate": 3.839275029916235e-05,
2784
+ "loss": 4.4537,
2785
+ "step": 197000
2786
+ },
2787
+ {
2788
+ "epoch": 39.389708815317114,
2789
+ "grad_norm": 3.3550708293914795,
2790
+ "learning_rate": 3.826834862385321e-05,
2791
+ "loss": 4.4677,
2792
+ "step": 197500
2793
+ },
2794
+ {
2795
+ "epoch": 39.489429597128044,
2796
+ "grad_norm": 3.289698600769043,
2797
+ "learning_rate": 3.814369764658955e-05,
2798
+ "loss": 4.4651,
2799
+ "step": 198000
2800
+ },
2801
+ {
2802
+ "epoch": 39.58915037893897,
2803
+ "grad_norm": 3.1596176624298096,
2804
+ "learning_rate": 3.801904666932589e-05,
2805
+ "loss": 4.4701,
2806
+ "step": 198500
2807
+ },
2808
+ {
2809
+ "epoch": 39.6888711607499,
2810
+ "grad_norm": 3.2771639823913574,
2811
+ "learning_rate": 3.7894395692062226e-05,
2812
+ "loss": 4.4671,
2813
+ "step": 199000
2814
+ },
2815
+ {
2816
+ "epoch": 39.78859194256083,
2817
+ "grad_norm": 3.5630061626434326,
2818
+ "learning_rate": 3.776999401675309e-05,
2819
+ "loss": 4.4655,
2820
+ "step": 199500
2821
+ },
2822
+ {
2823
+ "epoch": 39.88831272437176,
2824
+ "grad_norm": 3.4128897190093994,
2825
+ "learning_rate": 3.7645343039489436e-05,
2826
+ "loss": 4.4844,
2827
+ "step": 200000
2828
+ },
2829
+ {
2830
+ "epoch": 39.98803350618269,
2831
+ "grad_norm": 3.294849395751953,
2832
+ "learning_rate": 3.752069206222577e-05,
2833
+ "loss": 4.4722,
2834
+ "step": 200500
2835
+ },
2836
+ {
2837
+ "epoch": 40.0,
2838
+ "step": 200560,
2839
+ "total_flos": 977845223424000.0,
2840
+ "train_loss": 1.1379991389100752,
2841
+ "train_runtime": 2290.6772,
2842
+ "train_samples_per_second": 1400.791,
2843
+ "train_steps_per_second": 87.555
2844
  }
2845
  ],
2846
  "logging_steps": 500,
2847
+ "max_steps": 200560,
2848
  "num_input_tokens_seen": 0,
2849
+ "num_train_epochs": 40,
2850
  "save_steps": 500,
2851
  "stateful_callbacks": {
2852
  "TrainerControl": {
 
2860
  "attributes": {}
2861
  }
2862
  },
2863
+ "total_flos": 977845223424000.0,
2864
  "train_batch_size": 16,
2865
  "trial_name": null,
2866
  "trial_params": null