Azrail commited on
Commit
ef24e65
·
verified ·
1 Parent(s): 97e8c16

Training in progress, step 137000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fe644242ac85364957a221ecb3fda251252bbb21f78dcf32d44ddb45cee4b8c
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b38436cae5381f691ba804b915e325932d55429d83532b1470e95efd579a29b
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a0bb2637b2d27c703e80119c30822f6cacfac9cba885cfe1635772ce684b387
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b3d9c01ac2fd401fd65707f0e1d6a24eefcca9fe471c863196aa9b97efe6f47
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c2ffcf5f582912b4a7016b15e29048dddaa402730efcd133059a2e08945301c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7354a4e3d8de85b55d51bbeb0dfcfc86efd5d09ac4e401efe6b4ee83bc0b66a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff50fa4a38896a05eab7dc1bfd456c8019098d112a942a25a411381c6596e51c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bf416de216a0fa7180c9c5b3632984e63b58047aa8bc6d944e50f798fb000d5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.297449229044683,
6
  "eval_steps": 500,
7
- "global_step": 136000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24216,11 +24216,189 @@
24216
  "eval_steps_per_second": 15.073,
24217
  "num_input_tokens_seen": 71291638272,
24218
  "step": 136000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24219
  }
24220
  ],
24221
  "logging_steps": 50,
24222
  "max_steps": 140000,
24223
- "num_input_tokens_seen": 71291638272,
24224
  "num_train_epochs": 2,
24225
  "save_steps": 1000,
24226
  "stateful_callbacks": {
@@ -24235,7 +24413,7 @@
24235
  "attributes": {}
24236
  }
24237
  },
24238
- "total_flos": 1.2617319614661919e+20,
24239
  "train_batch_size": 32,
24240
  "trial_name": null,
24241
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.3069892793684486,
6
  "eval_steps": 500,
7
+ "global_step": 137000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24216
  "eval_steps_per_second": 15.073,
24217
  "num_input_tokens_seen": 71291638272,
24218
  "step": 136000
24219
+ },
24220
+ {
24221
+ "epoch": 1.2979262315608715,
24222
+ "grad_norm": 0.1190498098731041,
24223
+ "learning_rate": 4.8305620184135315e-05,
24224
+ "loss": 2.0321,
24225
+ "num_input_tokens_seen": 71317844512,
24226
+ "step": 136050
24227
+ },
24228
+ {
24229
+ "epoch": 1.2984032340770597,
24230
+ "grad_norm": 0.11770997196435928,
24231
+ "learning_rate": 4.7109889986402973e-05,
24232
+ "loss": 2.0341,
24233
+ "num_input_tokens_seen": 71344050560,
24234
+ "step": 136100
24235
+ },
24236
+ {
24237
+ "epoch": 1.2988802365932481,
24238
+ "grad_norm": 0.11683844774961472,
24239
+ "learning_rate": 4.592841308745932e-05,
24240
+ "loss": 2.0243,
24241
+ "num_input_tokens_seen": 71370258656,
24242
+ "step": 136150
24243
+ },
24244
+ {
24245
+ "epoch": 1.2993572391094363,
24246
+ "grad_norm": 0.12114414572715759,
24247
+ "learning_rate": 4.476122667059207e-05,
24248
+ "loss": 2.0379,
24249
+ "num_input_tokens_seen": 71396470656,
24250
+ "step": 136200
24251
+ },
24252
+ {
24253
+ "epoch": 1.2998342416256246,
24254
+ "grad_norm": 0.11975762993097305,
24255
+ "learning_rate": 4.3608367469340547e-05,
24256
+ "loss": 2.0359,
24257
+ "num_input_tokens_seen": 71422685056,
24258
+ "step": 136250
24259
+ },
24260
+ {
24261
+ "epoch": 1.3003112441418128,
24262
+ "grad_norm": 0.11278797686100006,
24263
+ "learning_rate": 4.2469871766340095e-05,
24264
+ "loss": 2.0219,
24265
+ "num_input_tokens_seen": 71448892928,
24266
+ "step": 136300
24267
+ },
24268
+ {
24269
+ "epoch": 1.3007882466580012,
24270
+ "grad_norm": 0.11854268610477448,
24271
+ "learning_rate": 4.1345775392179654e-05,
24272
+ "loss": 2.0404,
24273
+ "num_input_tokens_seen": 71475094528,
24274
+ "step": 136350
24275
+ },
24276
+ {
24277
+ "epoch": 1.3012652491741894,
24278
+ "grad_norm": 0.11631016433238983,
24279
+ "learning_rate": 4.0236113724274713e-05,
24280
+ "loss": 2.0301,
24281
+ "num_input_tokens_seen": 71501303968,
24282
+ "step": 136400
24283
+ },
24284
+ {
24285
+ "epoch": 1.3017422516903776,
24286
+ "grad_norm": 0.11170602589845657,
24287
+ "learning_rate": 3.9140921685753064e-05,
24288
+ "loss": 2.0431,
24289
+ "num_input_tokens_seen": 71527518368,
24290
+ "step": 136450
24291
+ },
24292
+ {
24293
+ "epoch": 1.302219254206566,
24294
+ "grad_norm": 0.11311063915491104,
24295
+ "learning_rate": 3.806023374435663e-05,
24296
+ "loss": 2.0173,
24297
+ "num_input_tokens_seen": 71553726688,
24298
+ "step": 136500
24299
+ },
24300
+ {
24301
+ "epoch": 1.302219254206566,
24302
+ "eval_loss": 1.9524949789047241,
24303
+ "eval_runtime": 83.0874,
24304
+ "eval_samples_per_second": 60.178,
24305
+ "eval_steps_per_second": 15.044,
24306
+ "num_input_tokens_seen": 71553726688,
24307
+ "step": 136500
24308
+ },
24309
+ {
24310
+ "epoch": 1.3026962567227542,
24311
+ "grad_norm": 0.728589653968811,
24312
+ "learning_rate": 3.699408391135611e-05,
24313
+ "loss": 2.0415,
24314
+ "num_input_tokens_seen": 71579934304,
24315
+ "step": 136550
24316
+ },
24317
+ {
24318
+ "epoch": 1.3031732592389424,
24319
+ "grad_norm": 0.11253057420253754,
24320
+ "learning_rate": 3.594250574048058e-05,
24321
+ "loss": 2.0334,
24322
+ "num_input_tokens_seen": 71606145184,
24323
+ "step": 136600
24324
+ },
24325
+ {
24326
+ "epoch": 1.3036502617551307,
24327
+ "grad_norm": 0.12201691418886185,
24328
+ "learning_rate": 3.4905532326861944e-05,
24329
+ "loss": 2.0403,
24330
+ "num_input_tokens_seen": 71632351648,
24331
+ "step": 136650
24332
+ },
24333
+ {
24334
+ "epoch": 1.304127264271319,
24335
+ "grad_norm": 0.11976749449968338,
24336
+ "learning_rate": 3.3883196305992905e-05,
24337
+ "loss": 2.0292,
24338
+ "num_input_tokens_seen": 71658566048,
24339
+ "step": 136700
24340
+ },
24341
+ {
24342
+ "epoch": 1.3046042667875073,
24343
+ "grad_norm": 0.12131944298744202,
24344
+ "learning_rate": 3.2875529852700146e-05,
24345
+ "loss": 2.0405,
24346
+ "num_input_tokens_seen": 71684775808,
24347
+ "step": 136750
24348
+ },
24349
+ {
24350
+ "epoch": 1.3050812693036955,
24351
+ "grad_norm": 0.11625051498413086,
24352
+ "learning_rate": 3.18825646801314e-05,
24353
+ "loss": 2.0392,
24354
+ "num_input_tokens_seen": 71710990048,
24355
+ "step": 136800
24356
+ },
24357
+ {
24358
+ "epoch": 1.305558271819884,
24359
+ "grad_norm": 0.11870067566633224,
24360
+ "learning_rate": 3.0904332038757974e-05,
24361
+ "loss": 2.0388,
24362
+ "num_input_tokens_seen": 71737198176,
24363
+ "step": 136850
24364
+ },
24365
+ {
24366
+ "epoch": 1.3060352743360721,
24367
+ "grad_norm": 0.11490604281425476,
24368
+ "learning_rate": 2.994086271539048e-05,
24369
+ "loss": 2.0261,
24370
+ "num_input_tokens_seen": 71763409248,
24371
+ "step": 136900
24372
+ },
24373
+ {
24374
+ "epoch": 1.3065122768522603,
24375
+ "grad_norm": 0.1218944787979126,
24376
+ "learning_rate": 2.8992187032210516e-05,
24377
+ "loss": 2.0421,
24378
+ "num_input_tokens_seen": 71789610880,
24379
+ "step": 136950
24380
+ },
24381
+ {
24382
+ "epoch": 1.3069892793684486,
24383
+ "grad_norm": 0.11681609600782394,
24384
+ "learning_rate": 2.8058334845816213e-05,
24385
+ "loss": 2.0287,
24386
+ "num_input_tokens_seen": 71815816608,
24387
+ "step": 137000
24388
+ },
24389
+ {
24390
+ "epoch": 1.3069892793684486,
24391
+ "eval_loss": 1.951898455619812,
24392
+ "eval_runtime": 82.7779,
24393
+ "eval_samples_per_second": 60.403,
24394
+ "eval_steps_per_second": 15.101,
24395
+ "num_input_tokens_seen": 71815816608,
24396
+ "step": 137000
24397
  }
24398
  ],
24399
  "logging_steps": 50,
24400
  "max_steps": 140000,
24401
+ "num_input_tokens_seen": 71815816608,
24402
  "num_train_epochs": 2,
24403
  "save_steps": 1000,
24404
  "stateful_callbacks": {
 
24413
  "attributes": {}
24414
  }
24415
  },
24416
+ "total_flos": 1.271008961912107e+20,
24417
  "train_batch_size": 32,
24418
  "trial_name": null,
24419
  "trial_params": null