mohammadmahdinouri commited on
Commit
9cc549a
·
verified ·
1 Parent(s): 6b73ecc

Training in progress, step 18000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54574a1a58c678e30a243f8b4a3a0bbe8af33220d14ef8b42b3a78e339bd2289
3
  size 715030586
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b00afd84b6c9ce17eaf6cde875a1462d2a5f0a7c0b9c73a9b93dfa70356a2e2
3
  size 715030586
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa9d36d0137a10cfbdb87e0006e1d6ac58b82282168bdd0821c71eead1bdac32
3
  size 1032262338
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ebce57f62b5c08e94d3ef4d4c19d6f624921ff13378d5f419a1a0fc63ae8de2
3
  size 1032262338
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b23e565c0773e35ad09f1b2473ae578049f6a7780765ac862ecd6eeeee912c90
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13c1b31558f9530223d30967d940c908110b66ae87767dc8b41640c0ec2ab3ad
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a00867a90484a28803cbe8bd9d8069ef8cdd1a463e5589e32a25c51cb663295b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31e1f3d55bb567df3a2ebf344a0ee08608b18736ddff2de100218656482b16ab
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16d84e48d7ada91bf975c21c5daad767f717e082ce5b54bad6f761abd9bf7627
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7068584adf4719cad732133ffdff00b498545ab4f7b6d887d675a74b59641e2
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d75cbaf389582e1d108d15c09c443d2f4c8941c4b71faeeb3723b25a447f658b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f43ad3e51655951e2a9c021cf9bdd46d25eb6df7a162e3fc18fe50a401173803
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84d957adbd57639a95ced1440a685d29db26c75001a9b3061d2f7af9b9a721b1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add33ce1c647f1ad24436fdd2c7095ade5081fad618777000690c7e187278b49
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.033149548337403904,
6
  "eval_steps": 500,
7
- "global_step": 17000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11908,6 +11908,706 @@
11908
  "learning_rate": 0.000494636149601328,
11909
  "loss": 17.8281,
11910
  "step": 17000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11911
  }
11912
  ],
11913
  "logging_steps": 10,
@@ -11927,7 +12627,7 @@
11927
  "attributes": {}
11928
  }
11929
  },
11930
- "total_flos": 3.6732402572894142e+19,
11931
  "train_batch_size": 48,
11932
  "trial_name": null,
11933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.035099521769015894,
6
  "eval_steps": 500,
7
+ "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11908
  "learning_rate": 0.000494636149601328,
11909
  "loss": 17.8281,
11910
  "step": 17000
11911
+ },
11912
+ {
11913
+ "epoch": 0.033169048071720025,
11914
+ "grad_norm": 8.75,
11915
+ "learning_rate": 0.0004946328985858733,
11916
+ "loss": 17.885,
11917
+ "step": 17010
11918
+ },
11919
+ {
11920
+ "epoch": 0.03318854780603614,
11921
+ "grad_norm": 8.375,
11922
+ "learning_rate": 0.0004946296475704186,
11923
+ "loss": 17.946,
11924
+ "step": 17020
11925
+ },
11926
+ {
11927
+ "epoch": 0.03320804754035226,
11928
+ "grad_norm": 8.5,
11929
+ "learning_rate": 0.000494626396554964,
11930
+ "loss": 17.7649,
11931
+ "step": 17030
11932
+ },
11933
+ {
11934
+ "epoch": 0.03322754727466838,
11935
+ "grad_norm": 9.5,
11936
+ "learning_rate": 0.0004946231455395093,
11937
+ "loss": 17.8435,
11938
+ "step": 17040
11939
+ },
11940
+ {
11941
+ "epoch": 0.0332470470089845,
11942
+ "grad_norm": 22.375,
11943
+ "learning_rate": 0.0004946198945240546,
11944
+ "loss": 17.7794,
11945
+ "step": 17050
11946
+ },
11947
+ {
11948
+ "epoch": 0.03326654674330062,
11949
+ "grad_norm": 8.75,
11950
+ "learning_rate": 0.0004946166435085999,
11951
+ "loss": 17.7582,
11952
+ "step": 17060
11953
+ },
11954
+ {
11955
+ "epoch": 0.033286046477616744,
11956
+ "grad_norm": 8.4375,
11957
+ "learning_rate": 0.0004946133924931453,
11958
+ "loss": 17.9408,
11959
+ "step": 17070
11960
+ },
11961
+ {
11962
+ "epoch": 0.033305546211932865,
11963
+ "grad_norm": 8.1875,
11964
+ "learning_rate": 0.0004946101414776906,
11965
+ "loss": 17.8288,
11966
+ "step": 17080
11967
+ },
11968
+ {
11969
+ "epoch": 0.03332504594624898,
11970
+ "grad_norm": 8.1875,
11971
+ "learning_rate": 0.0004946068904622359,
11972
+ "loss": 17.8074,
11973
+ "step": 17090
11974
+ },
11975
+ {
11976
+ "epoch": 0.0333445456805651,
11977
+ "grad_norm": 11.5,
11978
+ "learning_rate": 0.0004946036394467813,
11979
+ "loss": 17.7826,
11980
+ "step": 17100
11981
+ },
11982
+ {
11983
+ "epoch": 0.03336404541488122,
11984
+ "grad_norm": 8.25,
11985
+ "learning_rate": 0.0004946003884313266,
11986
+ "loss": 17.8136,
11987
+ "step": 17110
11988
+ },
11989
+ {
11990
+ "epoch": 0.03338354514919734,
11991
+ "grad_norm": 7.90625,
11992
+ "learning_rate": 0.0004945971374158719,
11993
+ "loss": 17.7152,
11994
+ "step": 17120
11995
+ },
11996
+ {
11997
+ "epoch": 0.033403044883513464,
11998
+ "grad_norm": 9.1875,
11999
+ "learning_rate": 0.0004945938864004172,
12000
+ "loss": 17.6264,
12001
+ "step": 17130
12002
+ },
12003
+ {
12004
+ "epoch": 0.033422544617829585,
12005
+ "grad_norm": 8.875,
12006
+ "learning_rate": 0.0004945906353849625,
12007
+ "loss": 17.6933,
12008
+ "step": 17140
12009
+ },
12010
+ {
12011
+ "epoch": 0.0334420443521457,
12012
+ "grad_norm": 9.0625,
12013
+ "learning_rate": 0.0004945873843695078,
12014
+ "loss": 17.7102,
12015
+ "step": 17150
12016
+ },
12017
+ {
12018
+ "epoch": 0.03346154408646182,
12019
+ "grad_norm": 8.3125,
12020
+ "learning_rate": 0.0004945841333540531,
12021
+ "loss": 17.7096,
12022
+ "step": 17160
12023
+ },
12024
+ {
12025
+ "epoch": 0.03348104382077794,
12026
+ "grad_norm": 7.9375,
12027
+ "learning_rate": 0.0004945808823385984,
12028
+ "loss": 17.5945,
12029
+ "step": 17170
12030
+ },
12031
+ {
12032
+ "epoch": 0.03350054355509406,
12033
+ "grad_norm": 9.5625,
12034
+ "learning_rate": 0.0004945776313231438,
12035
+ "loss": 17.7132,
12036
+ "step": 17180
12037
+ },
12038
+ {
12039
+ "epoch": 0.03352004328941018,
12040
+ "grad_norm": 8.625,
12041
+ "learning_rate": 0.0004945743803076891,
12042
+ "loss": 17.8003,
12043
+ "step": 17190
12044
+ },
12045
+ {
12046
+ "epoch": 0.033539543023726304,
12047
+ "grad_norm": 9.4375,
12048
+ "learning_rate": 0.0004945711292922344,
12049
+ "loss": 17.8819,
12050
+ "step": 17200
12051
+ },
12052
+ {
12053
+ "epoch": 0.033559042758042425,
12054
+ "grad_norm": 11.8125,
12055
+ "learning_rate": 0.0004945678782767798,
12056
+ "loss": 17.7493,
12057
+ "step": 17210
12058
+ },
12059
+ {
12060
+ "epoch": 0.03357854249235854,
12061
+ "grad_norm": 8.9375,
12062
+ "learning_rate": 0.0004945646272613251,
12063
+ "loss": 17.8033,
12064
+ "step": 17220
12065
+ },
12066
+ {
12067
+ "epoch": 0.03359804222667466,
12068
+ "grad_norm": 8.25,
12069
+ "learning_rate": 0.0004945613762458704,
12070
+ "loss": 17.7948,
12071
+ "step": 17230
12072
+ },
12073
+ {
12074
+ "epoch": 0.03361754196099078,
12075
+ "grad_norm": 8.875,
12076
+ "learning_rate": 0.0004945581252304157,
12077
+ "loss": 17.6432,
12078
+ "step": 17240
12079
+ },
12080
+ {
12081
+ "epoch": 0.0336370416953069,
12082
+ "grad_norm": 8.125,
12083
+ "learning_rate": 0.0004945548742149611,
12084
+ "loss": 17.7967,
12085
+ "step": 17250
12086
+ },
12087
+ {
12088
+ "epoch": 0.03365654142962302,
12089
+ "grad_norm": 9.4375,
12090
+ "learning_rate": 0.0004945516231995064,
12091
+ "loss": 17.7517,
12092
+ "step": 17260
12093
+ },
12094
+ {
12095
+ "epoch": 0.033676041163939144,
12096
+ "grad_norm": 7.15625,
12097
+ "learning_rate": 0.0004945483721840517,
12098
+ "loss": 17.8602,
12099
+ "step": 17270
12100
+ },
12101
+ {
12102
+ "epoch": 0.03369554089825526,
12103
+ "grad_norm": 8.3125,
12104
+ "learning_rate": 0.0004945451211685971,
12105
+ "loss": 17.7164,
12106
+ "step": 17280
12107
+ },
12108
+ {
12109
+ "epoch": 0.03371504063257138,
12110
+ "grad_norm": 8.9375,
12111
+ "learning_rate": 0.0004945418701531423,
12112
+ "loss": 17.6317,
12113
+ "step": 17290
12114
+ },
12115
+ {
12116
+ "epoch": 0.0337345403668875,
12117
+ "grad_norm": 10.5625,
12118
+ "learning_rate": 0.0004945386191376876,
12119
+ "loss": 17.71,
12120
+ "step": 17300
12121
+ },
12122
+ {
12123
+ "epoch": 0.03375404010120362,
12124
+ "grad_norm": 8.125,
12125
+ "learning_rate": 0.0004945353681222329,
12126
+ "loss": 17.7735,
12127
+ "step": 17310
12128
+ },
12129
+ {
12130
+ "epoch": 0.03377353983551974,
12131
+ "grad_norm": 8.5,
12132
+ "learning_rate": 0.0004945321171067783,
12133
+ "loss": 17.8358,
12134
+ "step": 17320
12135
+ },
12136
+ {
12137
+ "epoch": 0.03379303956983586,
12138
+ "grad_norm": 8.9375,
12139
+ "learning_rate": 0.0004945288660913236,
12140
+ "loss": 17.6823,
12141
+ "step": 17330
12142
+ },
12143
+ {
12144
+ "epoch": 0.033812539304151984,
12145
+ "grad_norm": 7.90625,
12146
+ "learning_rate": 0.0004945256150758689,
12147
+ "loss": 17.6095,
12148
+ "step": 17340
12149
+ },
12150
+ {
12151
+ "epoch": 0.0338320390384681,
12152
+ "grad_norm": 8.625,
12153
+ "learning_rate": 0.0004945223640604143,
12154
+ "loss": 17.7989,
12155
+ "step": 17350
12156
+ },
12157
+ {
12158
+ "epoch": 0.03385153877278422,
12159
+ "grad_norm": 9.25,
12160
+ "learning_rate": 0.0004945191130449596,
12161
+ "loss": 17.8012,
12162
+ "step": 17360
12163
+ },
12164
+ {
12165
+ "epoch": 0.03387103850710034,
12166
+ "grad_norm": 8.5625,
12167
+ "learning_rate": 0.0004945158620295049,
12168
+ "loss": 17.701,
12169
+ "step": 17370
12170
+ },
12171
+ {
12172
+ "epoch": 0.03389053824141646,
12173
+ "grad_norm": 9.1875,
12174
+ "learning_rate": 0.0004945126110140502,
12175
+ "loss": 17.6966,
12176
+ "step": 17380
12177
+ },
12178
+ {
12179
+ "epoch": 0.03391003797573258,
12180
+ "grad_norm": 8.75,
12181
+ "learning_rate": 0.0004945093599985956,
12182
+ "loss": 17.7201,
12183
+ "step": 17390
12184
+ },
12185
+ {
12186
+ "epoch": 0.0339295377100487,
12187
+ "grad_norm": 8.0,
12188
+ "learning_rate": 0.0004945061089831409,
12189
+ "loss": 17.7151,
12190
+ "step": 17400
12191
+ },
12192
+ {
12193
+ "epoch": 0.03394903744436482,
12194
+ "grad_norm": 8.8125,
12195
+ "learning_rate": 0.0004945028579676862,
12196
+ "loss": 17.5905,
12197
+ "step": 17410
12198
+ },
12199
+ {
12200
+ "epoch": 0.03396853717868094,
12201
+ "grad_norm": 9.1875,
12202
+ "learning_rate": 0.0004944996069522316,
12203
+ "loss": 17.7166,
12204
+ "step": 17420
12205
+ },
12206
+ {
12207
+ "epoch": 0.03398803691299706,
12208
+ "grad_norm": 8.625,
12209
+ "learning_rate": 0.0004944963559367769,
12210
+ "loss": 17.7541,
12211
+ "step": 17430
12212
+ },
12213
+ {
12214
+ "epoch": 0.03400753664731318,
12215
+ "grad_norm": 8.1875,
12216
+ "learning_rate": 0.0004944931049213222,
12217
+ "loss": 17.6131,
12218
+ "step": 17440
12219
+ },
12220
+ {
12221
+ "epoch": 0.0340270363816293,
12222
+ "grad_norm": 8.4375,
12223
+ "learning_rate": 0.0004944898539058674,
12224
+ "loss": 17.6559,
12225
+ "step": 17450
12226
+ },
12227
+ {
12228
+ "epoch": 0.03404653611594542,
12229
+ "grad_norm": 8.0625,
12230
+ "learning_rate": 0.0004944866028904128,
12231
+ "loss": 17.7009,
12232
+ "step": 17460
12233
+ },
12234
+ {
12235
+ "epoch": 0.03406603585026154,
12236
+ "grad_norm": 7.78125,
12237
+ "learning_rate": 0.0004944833518749581,
12238
+ "loss": 17.7258,
12239
+ "step": 17470
12240
+ },
12241
+ {
12242
+ "epoch": 0.03408553558457766,
12243
+ "grad_norm": 7.75,
12244
+ "learning_rate": 0.0004944801008595034,
12245
+ "loss": 17.687,
12246
+ "step": 17480
12247
+ },
12248
+ {
12249
+ "epoch": 0.03410503531889378,
12250
+ "grad_norm": 9.1875,
12251
+ "learning_rate": 0.0004944768498440487,
12252
+ "loss": 17.6723,
12253
+ "step": 17490
12254
+ },
12255
+ {
12256
+ "epoch": 0.0341245350532099,
12257
+ "grad_norm": 8.0625,
12258
+ "learning_rate": 0.0004944735988285941,
12259
+ "loss": 17.6988,
12260
+ "step": 17500
12261
+ },
12262
+ {
12263
+ "epoch": 0.03414403478752602,
12264
+ "grad_norm": 8.875,
12265
+ "learning_rate": 0.0004944703478131394,
12266
+ "loss": 17.8097,
12267
+ "step": 17510
12268
+ },
12269
+ {
12270
+ "epoch": 0.03416353452184214,
12271
+ "grad_norm": 9.0625,
12272
+ "learning_rate": 0.0004944670967976847,
12273
+ "loss": 17.7569,
12274
+ "step": 17520
12275
+ },
12276
+ {
12277
+ "epoch": 0.03418303425615826,
12278
+ "grad_norm": 7.6875,
12279
+ "learning_rate": 0.0004944638457822301,
12280
+ "loss": 17.8112,
12281
+ "step": 17530
12282
+ },
12283
+ {
12284
+ "epoch": 0.034202533990474376,
12285
+ "grad_norm": 8.3125,
12286
+ "learning_rate": 0.0004944605947667754,
12287
+ "loss": 17.7088,
12288
+ "step": 17540
12289
+ },
12290
+ {
12291
+ "epoch": 0.0342220337247905,
12292
+ "grad_norm": 52.75,
12293
+ "learning_rate": 0.0004944573437513207,
12294
+ "loss": 17.743,
12295
+ "step": 17550
12296
+ },
12297
+ {
12298
+ "epoch": 0.03424153345910662,
12299
+ "grad_norm": 7.28125,
12300
+ "learning_rate": 0.000494454092735866,
12301
+ "loss": 17.5971,
12302
+ "step": 17560
12303
+ },
12304
+ {
12305
+ "epoch": 0.03426103319342274,
12306
+ "grad_norm": 8.8125,
12307
+ "learning_rate": 0.0004944508417204114,
12308
+ "loss": 17.5989,
12309
+ "step": 17570
12310
+ },
12311
+ {
12312
+ "epoch": 0.03428053292773886,
12313
+ "grad_norm": 7.71875,
12314
+ "learning_rate": 0.0004944475907049567,
12315
+ "loss": 17.6487,
12316
+ "step": 17580
12317
+ },
12318
+ {
12319
+ "epoch": 0.03430003266205498,
12320
+ "grad_norm": 8.0625,
12321
+ "learning_rate": 0.000494444339689502,
12322
+ "loss": 17.7165,
12323
+ "step": 17590
12324
+ },
12325
+ {
12326
+ "epoch": 0.0343195323963711,
12327
+ "grad_norm": 8.1875,
12328
+ "learning_rate": 0.0004944410886740474,
12329
+ "loss": 17.6039,
12330
+ "step": 17600
12331
+ },
12332
+ {
12333
+ "epoch": 0.034339032130687216,
12334
+ "grad_norm": 9.0,
12335
+ "learning_rate": 0.0004944378376585927,
12336
+ "loss": 17.701,
12337
+ "step": 17610
12338
+ },
12339
+ {
12340
+ "epoch": 0.03435853186500334,
12341
+ "grad_norm": 9.0625,
12342
+ "learning_rate": 0.000494434586643138,
12343
+ "loss": 17.7048,
12344
+ "step": 17620
12345
+ },
12346
+ {
12347
+ "epoch": 0.03437803159931946,
12348
+ "grad_norm": 9.5,
12349
+ "learning_rate": 0.0004944313356276833,
12350
+ "loss": 17.7427,
12351
+ "step": 17630
12352
+ },
12353
+ {
12354
+ "epoch": 0.03439753133363558,
12355
+ "grad_norm": 8.5,
12356
+ "learning_rate": 0.0004944280846122287,
12357
+ "loss": 17.6594,
12358
+ "step": 17640
12359
+ },
12360
+ {
12361
+ "epoch": 0.0344170310679517,
12362
+ "grad_norm": 9.5625,
12363
+ "learning_rate": 0.000494424833596774,
12364
+ "loss": 17.7552,
12365
+ "step": 17650
12366
+ },
12367
+ {
12368
+ "epoch": 0.03443653080226782,
12369
+ "grad_norm": 8.25,
12370
+ "learning_rate": 0.0004944215825813193,
12371
+ "loss": 17.6453,
12372
+ "step": 17660
12373
+ },
12374
+ {
12375
+ "epoch": 0.034456030536583936,
12376
+ "grad_norm": 7.40625,
12377
+ "learning_rate": 0.0004944183315658647,
12378
+ "loss": 17.8145,
12379
+ "step": 17670
12380
+ },
12381
+ {
12382
+ "epoch": 0.03447553027090006,
12383
+ "grad_norm": 9.5,
12384
+ "learning_rate": 0.00049441508055041,
12385
+ "loss": 17.6461,
12386
+ "step": 17680
12387
+ },
12388
+ {
12389
+ "epoch": 0.03449503000521618,
12390
+ "grad_norm": 9.5625,
12391
+ "learning_rate": 0.0004944118295349552,
12392
+ "loss": 17.7198,
12393
+ "step": 17690
12394
+ },
12395
+ {
12396
+ "epoch": 0.0345145297395323,
12397
+ "grad_norm": 8.5625,
12398
+ "learning_rate": 0.0004944085785195005,
12399
+ "loss": 17.7119,
12400
+ "step": 17700
12401
+ },
12402
+ {
12403
+ "epoch": 0.03453402947384842,
12404
+ "grad_norm": 8.0625,
12405
+ "learning_rate": 0.0004944053275040459,
12406
+ "loss": 17.653,
12407
+ "step": 17710
12408
+ },
12409
+ {
12410
+ "epoch": 0.03455352920816454,
12411
+ "grad_norm": 9.125,
12412
+ "learning_rate": 0.0004944020764885912,
12413
+ "loss": 17.7956,
12414
+ "step": 17720
12415
+ },
12416
+ {
12417
+ "epoch": 0.03457302894248066,
12418
+ "grad_norm": 8.3125,
12419
+ "learning_rate": 0.0004943988254731365,
12420
+ "loss": 17.7212,
12421
+ "step": 17730
12422
+ },
12423
+ {
12424
+ "epoch": 0.034592528676796776,
12425
+ "grad_norm": 8.125,
12426
+ "learning_rate": 0.0004943955744576818,
12427
+ "loss": 17.7634,
12428
+ "step": 17740
12429
+ },
12430
+ {
12431
+ "epoch": 0.0346120284111129,
12432
+ "grad_norm": 8.25,
12433
+ "learning_rate": 0.0004943923234422272,
12434
+ "loss": 17.6751,
12435
+ "step": 17750
12436
+ },
12437
+ {
12438
+ "epoch": 0.03463152814542902,
12439
+ "grad_norm": 9.0,
12440
+ "learning_rate": 0.0004943890724267725,
12441
+ "loss": 17.6671,
12442
+ "step": 17760
12443
+ },
12444
+ {
12445
+ "epoch": 0.03465102787974514,
12446
+ "grad_norm": 8.6875,
12447
+ "learning_rate": 0.0004943858214113178,
12448
+ "loss": 17.73,
12449
+ "step": 17770
12450
+ },
12451
+ {
12452
+ "epoch": 0.03467052761406126,
12453
+ "grad_norm": 9.5625,
12454
+ "learning_rate": 0.0004943825703958632,
12455
+ "loss": 17.713,
12456
+ "step": 17780
12457
+ },
12458
+ {
12459
+ "epoch": 0.03469002734837738,
12460
+ "grad_norm": 8.8125,
12461
+ "learning_rate": 0.0004943793193804085,
12462
+ "loss": 17.6888,
12463
+ "step": 17790
12464
+ },
12465
+ {
12466
+ "epoch": 0.034709527082693495,
12467
+ "grad_norm": 8.1875,
12468
+ "learning_rate": 0.0004943760683649538,
12469
+ "loss": 17.6683,
12470
+ "step": 17800
12471
+ },
12472
+ {
12473
+ "epoch": 0.034729026817009616,
12474
+ "grad_norm": 8.3125,
12475
+ "learning_rate": 0.0004943728173494991,
12476
+ "loss": 17.6479,
12477
+ "step": 17810
12478
+ },
12479
+ {
12480
+ "epoch": 0.03474852655132574,
12481
+ "grad_norm": 8.8125,
12482
+ "learning_rate": 0.0004943695663340445,
12483
+ "loss": 17.7058,
12484
+ "step": 17820
12485
+ },
12486
+ {
12487
+ "epoch": 0.03476802628564186,
12488
+ "grad_norm": 9.625,
12489
+ "learning_rate": 0.0004943663153185898,
12490
+ "loss": 17.6571,
12491
+ "step": 17830
12492
+ },
12493
+ {
12494
+ "epoch": 0.03478752601995798,
12495
+ "grad_norm": 11.0625,
12496
+ "learning_rate": 0.0004943630643031351,
12497
+ "loss": 17.6084,
12498
+ "step": 17840
12499
+ },
12500
+ {
12501
+ "epoch": 0.0348070257542741,
12502
+ "grad_norm": 8.875,
12503
+ "learning_rate": 0.0004943598132876805,
12504
+ "loss": 17.6829,
12505
+ "step": 17850
12506
+ },
12507
+ {
12508
+ "epoch": 0.03482652548859022,
12509
+ "grad_norm": 10.4375,
12510
+ "learning_rate": 0.0004943565622722258,
12511
+ "loss": 17.6978,
12512
+ "step": 17860
12513
+ },
12514
+ {
12515
+ "epoch": 0.034846025222906335,
12516
+ "grad_norm": 8.1875,
12517
+ "learning_rate": 0.0004943533112567711,
12518
+ "loss": 17.5957,
12519
+ "step": 17870
12520
+ },
12521
+ {
12522
+ "epoch": 0.034865524957222456,
12523
+ "grad_norm": 8.125,
12524
+ "learning_rate": 0.0004943500602413164,
12525
+ "loss": 17.6825,
12526
+ "step": 17880
12527
+ },
12528
+ {
12529
+ "epoch": 0.03488502469153858,
12530
+ "grad_norm": 8.25,
12531
+ "learning_rate": 0.0004943468092258618,
12532
+ "loss": 17.5999,
12533
+ "step": 17890
12534
+ },
12535
+ {
12536
+ "epoch": 0.0349045244258547,
12537
+ "grad_norm": 8.5,
12538
+ "learning_rate": 0.0004943435582104071,
12539
+ "loss": 17.5818,
12540
+ "step": 17900
12541
+ },
12542
+ {
12543
+ "epoch": 0.03492402416017082,
12544
+ "grad_norm": 8.875,
12545
+ "learning_rate": 0.0004943403071949523,
12546
+ "loss": 17.5646,
12547
+ "step": 17910
12548
+ },
12549
+ {
12550
+ "epoch": 0.03494352389448694,
12551
+ "grad_norm": 7.875,
12552
+ "learning_rate": 0.0004943370561794977,
12553
+ "loss": 17.7599,
12554
+ "step": 17920
12555
+ },
12556
+ {
12557
+ "epoch": 0.03496302362880306,
12558
+ "grad_norm": 9.875,
12559
+ "learning_rate": 0.000494333805164043,
12560
+ "loss": 17.7103,
12561
+ "step": 17930
12562
+ },
12563
+ {
12564
+ "epoch": 0.034982523363119175,
12565
+ "grad_norm": 10.1875,
12566
+ "learning_rate": 0.0004943305541485883,
12567
+ "loss": 17.5457,
12568
+ "step": 17940
12569
+ },
12570
+ {
12571
+ "epoch": 0.035002023097435296,
12572
+ "grad_norm": 7.59375,
12573
+ "learning_rate": 0.0004943273031331336,
12574
+ "loss": 17.6915,
12575
+ "step": 17950
12576
+ },
12577
+ {
12578
+ "epoch": 0.03502152283175142,
12579
+ "grad_norm": 8.5,
12580
+ "learning_rate": 0.000494324052117679,
12581
+ "loss": 17.5628,
12582
+ "step": 17960
12583
+ },
12584
+ {
12585
+ "epoch": 0.03504102256606754,
12586
+ "grad_norm": 8.1875,
12587
+ "learning_rate": 0.0004943208011022243,
12588
+ "loss": 17.7019,
12589
+ "step": 17970
12590
+ },
12591
+ {
12592
+ "epoch": 0.03506052230038366,
12593
+ "grad_norm": 8.0,
12594
+ "learning_rate": 0.0004943175500867696,
12595
+ "loss": 17.7066,
12596
+ "step": 17980
12597
+ },
12598
+ {
12599
+ "epoch": 0.03508002203469978,
12600
+ "grad_norm": 8.5,
12601
+ "learning_rate": 0.000494314299071315,
12602
+ "loss": 17.5608,
12603
+ "step": 17990
12604
+ },
12605
+ {
12606
+ "epoch": 0.035099521769015894,
12607
+ "grad_norm": 7.46875,
12608
+ "learning_rate": 0.0004943110480558603,
12609
+ "loss": 17.5819,
12610
+ "step": 18000
12611
  }
12612
  ],
12613
  "logging_steps": 10,
 
12627
  "attributes": {}
12628
  }
12629
  },
12630
+ "total_flos": 3.889326067389196e+19,
12631
  "train_batch_size": 48,
12632
  "trial_name": null,
12633
  "trial_params": null