Bingsu committed on
Commit 73b22f5
1 Parent(s): 6130161

Training in progress, step 110000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0978bc6ba8af6107f37b1ce0de2823f6ffed0e6f3357ff497f2e879a2e834ef3
+ oid sha256:66a5ef622dbe8d56e7b956a8e46b2dc79895c2e55948ea096cb19b5071a69a55
  size 586828837
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dbd29e5163387c30aab8a8890d9ee62efc996b56b102107768b0eabf5e23817e
+ oid sha256:d4ef7a37dd78952cbb4fc37873637ea684f070d1e44bde74352b791d0c661cd4
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6c1dca87fa841c98848654f4170210b8999092daa8068e42b3812f3ab2a9ca99
+ oid sha256:e483b397451e40991ea9aa448be1d4addf388b682e3c225fd31dfc476ba76800
  size 14503
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:43c91aaff4049dd76fbb2e0bcf40a0522c406dbf03765ebea50f6fb1be9645c2
+ oid sha256:42dbbdc5cc13bd878c323aa5fdc58a0d0d735dc686a8a6a2c1206cf27279185b
  size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9fc0cfe80fceb3705f7126b83f3fe0e36d87dafa6df093df20b056316ba4fd28
+ oid sha256:645860789a54b6c41f5c11e77f6f0318ed7d04c0720af434b0a590d8b30f84c0
  size 733555848
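
The pointer files above record only a sha256 oid and a byte size per LFS object. As a quick sanity check after downloading the checkpoint, one can recompute both locally; this is a minimal sketch, not part of the repository, and the local path is an assumption.

import hashlib
import os

def verify_lfs_object(path, expected_sha256, expected_size):
    # Recompute the size and sha256 digest that the git-lfs pointer records.
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256

# Values copied from the new last-checkpoint/optimizer.pt pointer in this commit.
print(verify_lfs_object(
    "last-checkpoint/optimizer.pt",  # assumed local download location
    "66a5ef622dbe8d56e7b956a8e46b2dc79895c2e55948ea096cb19b5071a69a55",
    586828837,
))
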
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.42972493307034165,
- "global_step": 100000,
+ "epoch": 0.47269742637737583,
+ "global_step": 110000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3006,11 +3006,311 @@
  "learning_rate": 0.0027978817870494,
  "loss": 8.5906,
  "step": 100000
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 0.0028065907522651585,
+ "loss": 8.5938,
+ "step": 100200
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 0.002815310216214826,
+ "loss": 8.5887,
+ "step": 100400
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 0.00282404014065031,
+ "loss": 8.5922,
+ "step": 100600
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 0.0028327804872776367,
+ "loss": 8.5926,
+ "step": 100800
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 0.002841531217757113,
+ "loss": 8.5978,
+ "step": 101000
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 0.0028502922937035,
+ "loss": 8.5984,
+ "step": 101200
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0028590636766861726,
+ "loss": 8.6046,
+ "step": 101400
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0028678453282293013,
+ "loss": 8.6093,
+ "step": 101600
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0028766372098120076,
+ "loss": 8.6083,
+ "step": 101800
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0028854392828685377,
+ "loss": 8.6057,
+ "step": 102000
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0028942515087884407,
+ "loss": 8.6146,
+ "step": 102200
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.00290307384891672,
+ "loss": 8.608,
+ "step": 102400
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.00291190626455402,
+ "loss": 8.6081,
+ "step": 102600
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0029207044797924615,
+ "loss": 8.6164,
+ "step": 102800
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0029295568802797795,
+ "loss": 8.6008,
+ "step": 103000
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.0029384192401078115,
+ "loss": 8.6166,
+ "step": 103200
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 0.00294729152040165,
+ "loss": 8.5962,
+ "step": 103400
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.002956173682242877,
+ "loss": 8.6129,
+ "step": 103600
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.002965065686669722,
+ "loss": 8.6092,
+ "step": 103800
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0029739674946772463,
+ "loss": 8.6189,
+ "step": 104000
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.002982879067217503,
+ "loss": 8.612,
+ "step": 104200
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0029918003651997144,
+ "loss": 8.6135,
+ "step": 104400
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.003000731349490442,
+ "loss": 8.6182,
+ "step": 104600
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0030096719809137584,
+ "loss": 8.6423,
+ "step": 104800
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.003018622220251419,
+ "loss": 8.6145,
+ "step": 105000
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0030275372054660438,
+ "loss": 8.6249,
+ "step": 105200
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0030365064952603237,
+ "loss": 8.6265,
+ "step": 105400
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0030454852752588536,
+ "loss": 8.6304,
+ "step": 105600
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 0.0030544735060760494,
+ "loss": 8.6309,
+ "step": 105800
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.0030634711482848704,
+ "loss": 8.6258,
+ "step": 106000
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.003072478162416994,
+ "loss": 8.6328,
+ "step": 106200
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.003081494508962985,
+ "loss": 8.6298,
+ "step": 106400
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.0030905201483724717,
+ "loss": 8.639,
+ "step": 106600
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.0030995550410543226,
+ "loss": 8.6212,
+ "step": 106800
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.0031085991473768114,
+ "loss": 8.6374,
+ "step": 107000
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.003117652427667799,
+ "loss": 8.6326,
+ "step": 107200
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.0031267148422149046,
+ "loss": 8.6291,
+ "step": 107400
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.003135740971163656,
+ "loss": 8.6375,
+ "step": 107600
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.0031448214897512507,
+ "loss": 8.6226,
+ "step": 107800
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.003153911023417371,
+ "loss": 8.6359,
+ "step": 108000
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 0.003163009532290608,
+ "loss": 8.6491,
+ "step": 108200
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0031721169764601844,
+ "loss": 8.6405,
+ "step": 108400
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0031812333159761293,
+ "loss": 8.632,
+ "step": 108600
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.003190358510849451,
+ "loss": 8.6363,
+ "step": 108800
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0031994925210523124,
+ "loss": 8.6316,
+ "step": 109000
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.0032086353065182106,
+ "loss": 8.6423,
+ "step": 109200
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.003217786827142146,
+ "loss": 8.6274,
+ "step": 109400
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.003226947042780804,
+ "loss": 8.6366,
+ "step": 109600
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.003236070047437989,
+ "loss": 8.6388,
+ "step": 109800
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 0.003245247489550804,
+ "loss": 8.6364,
+ "step": 110000
  }
  ],
  "max_steps": 1000000,
  "num_train_epochs": 5,
- "total_flos": 1.593829982208e+17,
+ "total_flos": 1.7532129804288e+17,
  "trial_name": null,
  "trial_params": null
  }
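
For reference, a minimal sketch of inspecting the updated trainer_state.json once the checkpoint directory is downloaded locally. It assumes the per-step entries shown in the diff live under the standard Trainer "log_history" key, which is not visible in the excerpt above.

import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Top-level fields from the diff: epoch ~0.4727, global_step 110000, max_steps 1000000.
print(state["epoch"], state["global_step"], state["max_steps"])

# Each logged entry mirrors the added lines above: epoch, learning_rate, loss, step.
for entry in state.get("log_history", [])[-3:]:
    print(entry["step"], entry["learning_rate"], entry["loss"])
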
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dbd29e5163387c30aab8a8890d9ee62efc996b56b102107768b0eabf5e23817e
+ oid sha256:d4ef7a37dd78952cbb4fc37873637ea684f070d1e44bde74352b791d0c661cd4
  size 146774203
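
If the updated weights are needed outside a git clone, they can be fetched with the Hub client library; a rough sketch follows, where the repo_id is a placeholder since this page does not show the repository name.

from huggingface_hub import hf_hub_download

# "user/model" is a placeholder repo_id, not taken from this page.
weights_path = hf_hub_download(repo_id="user/model", filename="pytorch_model.bin")
print(weights_path)
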