diff --git "a/logs/main/main_log.txt" "b/logs/main/main_log.txt" --- "a/logs/main/main_log.txt" +++ "b/logs/main/main_log.txt" @@ -7325,3 +7325,2638 @@ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default [default7]: iteration 2385/ 3814 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 5.45 | learning rate: 3.659E-05 | global batch size: 512 | lm loss: 5.035568E+00 | loss scale: 65536.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.955 | TFLOPs: 70.96 | [default7]: iteration 2386/ 3814 | consumed samples: 1221632 | consumed tokens: 2501902336 | elapsed time per iteration (s): 5.45 | learning rate: 3.654E-05 | global batch size: 512 | lm loss: 5.016223E+00 | loss scale: 65536.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.895 | TFLOPs: 70.92 | [default7]: iteration 2387/ 3814 | consumed samples: 1222144 | consumed tokens: 2502950912 | elapsed time per iteration (s): 5.46 | learning rate: 3.650E-05 | global batch size: 512 | lm loss: 5.058045E+00 | loss scale: 65536.0 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.742 | TFLOPs: 70.80 | +[default7]: iteration 2388/ 3814 | consumed samples: 1222656 | consumed tokens: 2503999488 | elapsed time per iteration (s): 5.45 | learning rate: 3.646E-05 | global batch size: 512 | lm loss: 5.058588E+00 | loss scale: 65536.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.935 | TFLOPs: 70.95 | +[default7]: iteration 2389/ 3814 | consumed samples: 1223168 | consumed tokens: 2505048064 | elapsed time per iteration (s): 5.44 | learning rate: 3.641E-05 | global batch size: 512 | lm loss: 5.057855E+00 | loss scale: 65536.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.114 | TFLOPs: 71.09 | +[default7]: iteration 2390/ 3814 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 5.42 | learning rate: 3.637E-05 | global batch size: 512 | lm loss: 5.054258E+00 | loss scale: 65536.0 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.426 | TFLOPs: 71.32 | +[default7]: iteration 2391/ 3814 | consumed samples: 1224192 | consumed tokens: 2507145216 | elapsed time per iteration (s): 5.42 | learning rate: 3.633E-05 | global batch size: 512 | lm loss: 5.051129E+00 | loss scale: 65536.0 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.422 | TFLOPs: 71.32 | +[default7]: iteration 2392/ 3814 | consumed samples: 1224704 | consumed tokens: 2508193792 | elapsed time per iteration (s): 5.46 | learning rate: 3.628E-05 | global batch size: 512 | lm loss: 5.040317E+00 | loss scale: 65536.0 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.703 | TFLOPs: 70.77 | +[default7]: iteration 2393/ 3814 | consumed samples: 1225216 | consumed tokens: 2509242368 | elapsed time per iteration (s): 5.44 | learning rate: 3.624E-05 | global batch size: 512 | lm loss: 5.024004E+00 | loss scale: 65536.0 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.167 | TFLOPs: 71.12 | +[default7]: iteration 2394/ 3814 | consumed samples: 1225728 | consumed tokens: 2510290944 | elapsed time per iteration (s): 5.43 | learning rate: 3.619E-05 | global batch size: 512 | lm loss: 5.035820E+00 | loss scale: 65536.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.212 | TFLOPs: 71.16 | +[default7]: iteration 2395/ 3814 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 5.46 | learning rate: 3.615E-05 | global batch size: 512 | lm loss: 5.035029E+00 | loss scale: 65536.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.760 | TFLOPs: 70.82 | +[default7]: iteration 2396/ 3814 | consumed samples: 1226752 | consumed tokens: 2512388096 | elapsed time per iteration (s): 5.44 | learning rate: 3.611E-05 | global batch size: 512 | lm loss: 5.081193E+00 | loss scale: 65536.0 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.059 | TFLOPs: 71.04 | +[default7]: iteration 2397/ 3814 | consumed samples: 1227264 | consumed tokens: 2513436672 | elapsed time per iteration (s): 5.44 | learning rate: 3.606E-05 | global batch size: 512 | lm loss: 5.043208E+00 | loss scale: 65536.0 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.054 | TFLOPs: 71.04 | +[default7]: iteration 2398/ 3814 | consumed samples: 1227776 | consumed tokens: 2514485248 | elapsed time per iteration (s): 5.42 | learning rate: 3.602E-05 | global batch size: 512 | lm loss: 5.049439E+00 | loss scale: 65536.0 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.496 | TFLOPs: 71.37 | +[default7]: iteration 2399/ 3814 | consumed samples: 1228288 | consumed tokens: 2515533824 | elapsed time per iteration (s): 5.44 | learning rate: 3.598E-05 | global batch size: 512 | lm loss: 5.038866E+00 | loss scale: 65536.0 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.101 | TFLOPs: 71.08 | +[default7]: iteration 2400/ 3814 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 5.47 | learning rate: 3.593E-05 | global batch size: 512 | lm loss: 5.032152E+00 | loss scale: 65536.0 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.583 | TFLOPs: 70.68 | +[default7]: iteration 2401/ 3814 | consumed samples: 1229312 | consumed tokens: 2517630976 | elapsed time per iteration (s): 5.45 | learning rate: 3.589E-05 | global batch size: 512 | lm loss: 5.034581E+00 | loss scale: 65536.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.025 | TFLOPs: 71.02 | +[default7]: iteration 2402/ 3814 | consumed samples: 1229824 | consumed tokens: 2518679552 | elapsed time per iteration (s): 5.46 | learning rate: 3.585E-05 | global batch size: 512 | lm loss: 5.045053E+00 | loss scale: 65536.0 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.740 | TFLOPs: 70.80 | +[default7]: iteration 2403/ 3814 | consumed samples: 1230336 | consumed tokens: 2519728128 | elapsed time per iteration (s): 5.46 | learning rate: 3.580E-05 | global batch size: 512 | lm loss: 5.040744E+00 | loss scale: 65536.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.780 | TFLOPs: 70.83 | +[default7]: iteration 2404/ 3814 | consumed samples: 1230848 | consumed tokens: 2520776704 | elapsed time per iteration (s): 5.46 | learning rate: 3.576E-05 | global batch size: 512 | lm loss: 5.035909E+00 | loss scale: 65536.0 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.844 | TFLOPs: 70.88 | +[default7]: iteration 2405/ 3814 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 5.43 | learning rate: 3.571E-05 | global batch size: 512 | lm loss: 5.026386E+00 | loss scale: 65536.0 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.250 | TFLOPs: 71.19 | +[default7]: iteration 2406/ 3814 | consumed samples: 1231872 | consumed tokens: 2522873856 | elapsed time per iteration (s): 5.45 | learning rate: 3.567E-05 | global batch size: 512 | lm loss: 5.013245E+00 | loss scale: 65536.0 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.960 | TFLOPs: 70.97 | +[default7]: iteration 2407/ 3814 | consumed samples: 1232384 | consumed tokens: 2523922432 | elapsed time per iteration (s): 5.45 | learning rate: 3.563E-05 | global batch size: 512 | lm loss: 5.039238E+00 | loss scale: 65536.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.915 | TFLOPs: 70.94 | +[default7]: iteration 2408/ 3814 | consumed samples: 1232896 | consumed tokens: 2524971008 | elapsed time per iteration (s): 5.45 | learning rate: 3.558E-05 | global batch size: 512 | lm loss: 5.032116E+00 | loss scale: 65536.0 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.871 | TFLOPs: 70.90 | +[default7]: iteration 2409/ 3814 | consumed samples: 1233408 | consumed tokens: 2526019584 | elapsed time per iteration (s): 5.45 | learning rate: 3.554E-05 | global batch size: 512 | lm loss: 5.018023E+00 | loss scale: 65536.0 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.005 | TFLOPs: 71.00 | +[default7]: iteration 2410/ 3814 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 5.44 | learning rate: 3.550E-05 | global batch size: 512 | lm loss: 5.012621E+00 | loss scale: 65536.0 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.061 | TFLOPs: 71.05 | +[default7]: iteration 2411/ 3814 | consumed samples: 1234432 | consumed tokens: 2528116736 | elapsed time per iteration (s): 5.46 | learning rate: 3.545E-05 | global batch size: 512 | lm loss: 5.050149E+00 | loss scale: 65536.0 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.789 | TFLOPs: 70.84 | +[default7]: iteration 2412/ 3814 | consumed samples: 1234944 | consumed tokens: 2529165312 | elapsed time per iteration (s): 5.45 | learning rate: 3.541E-05 | global batch size: 512 | lm loss: 5.027584E+00 | loss scale: 65536.0 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.937 | TFLOPs: 70.95 | +[default7]: iteration 2413/ 3814 | consumed samples: 1235456 | consumed tokens: 2530213888 | elapsed time per iteration (s): 5.43 | learning rate: 3.537E-05 | global batch size: 512 | lm loss: 5.036427E+00 | loss scale: 65536.0 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.304 | TFLOPs: 71.23 | +[default7]: iteration 2414/ 3814 | consumed samples: 1235968 | consumed tokens: 2531262464 | elapsed time per iteration (s): 5.44 | learning rate: 3.532E-05 | global batch size: 512 | lm loss: 5.048415E+00 | loss scale: 65536.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.049 | TFLOPs: 71.04 | +[default7]: iteration 2415/ 3814 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 5.45 | learning rate: 3.528E-05 | global batch size: 512 | lm loss: 5.046865E+00 | loss scale: 65536.0 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.984 | TFLOPs: 70.99 | +[default7]: iteration 2416/ 3814 | consumed samples: 1236992 | consumed tokens: 2533359616 | elapsed time per iteration (s): 5.46 | learning rate: 3.524E-05 | global batch size: 512 | lm loss: 5.020695E+00 | loss scale: 65536.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.805 | TFLOPs: 70.85 | +[default7]: iteration 2417/ 3814 | consumed samples: 1237504 | consumed tokens: 2534408192 | elapsed time per iteration (s): 5.46 | learning rate: 3.519E-05 | global batch size: 512 | lm loss: 5.047297E+00 | loss scale: 65536.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.838 | TFLOPs: 70.88 | +[default7]: iteration 2418/ 3814 | consumed samples: 1238016 | consumed tokens: 2535456768 | elapsed time per iteration (s): 5.48 | learning rate: 3.515E-05 | global batch size: 512 | lm loss: 5.011358E+00 | loss scale: 65536.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.510 | TFLOPs: 70.63 | +[default7]: iteration 2419/ 3814 | consumed samples: 1238528 | consumed tokens: 2536505344 | elapsed time per iteration (s): 5.47 | learning rate: 3.511E-05 | global batch size: 512 | lm loss: 5.018526E+00 | loss scale: 65536.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.662 | TFLOPs: 70.74 | +[default7]: iteration 2420/ 3814 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 5.46 | learning rate: 3.506E-05 | global batch size: 512 | lm loss: 5.037924E+00 | loss scale: 65536.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.722 | TFLOPs: 70.79 | +[default7]: iteration 2421/ 3814 | consumed samples: 1239552 | consumed tokens: 2538602496 | elapsed time per iteration (s): 5.47 | learning rate: 3.502E-05 | global batch size: 512 | lm loss: 5.012375E+00 | loss scale: 65536.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.677 | TFLOPs: 70.76 | +[default7]: iteration 2422/ 3814 | consumed samples: 1240064 | consumed tokens: 2539651072 | elapsed time per iteration (s): 5.45 | learning rate: 3.498E-05 | global batch size: 512 | lm loss: 5.028972E+00 | loss scale: 65536.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.924 | TFLOPs: 70.94 | +[default7]: iteration 2423/ 3814 | consumed samples: 1240576 | consumed tokens: 2540699648 | elapsed time per iteration (s): 5.45 | learning rate: 3.493E-05 | global batch size: 512 | lm loss: 5.015338E+00 | loss scale: 65536.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.029 | TFLOPs: 71.02 | +[default7]: iteration 2424/ 3814 | consumed samples: 1241088 | consumed tokens: 2541748224 | elapsed time per iteration (s): 5.44 | learning rate: 3.489E-05 | global batch size: 512 | lm loss: 5.013425E+00 | loss scale: 65536.0 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.128 | TFLOPs: 71.10 | +[default7]: iteration 2425/ 3814 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 5.45 | learning rate: 3.485E-05 | global batch size: 512 | lm loss: 5.024074E+00 | loss scale: 65536.0 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.939 | TFLOPs: 70.95 | +[default7]: iteration 2426/ 3814 | consumed samples: 1242112 | consumed tokens: 2543845376 | elapsed time per iteration (s): 5.44 | learning rate: 3.480E-05 | global batch size: 512 | lm loss: 5.014858E+00 | loss scale: 65536.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.149 | TFLOPs: 71.11 | +[default7]: iteration 2427/ 3814 | consumed samples: 1242624 | consumed tokens: 2544893952 | elapsed time per iteration (s): 5.44 | learning rate: 3.476E-05 | global batch size: 512 | lm loss: 5.023463E+00 | loss scale: 65536.0 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.125 | TFLOPs: 71.09 | +[default7]: iteration 2428/ 3814 | consumed samples: 1243136 | consumed tokens: 2545942528 | elapsed time per iteration (s): 5.45 | learning rate: 3.472E-05 | global batch size: 512 | lm loss: 5.035069E+00 | loss scale: 65536.0 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.930 | TFLOPs: 70.95 | +[default7]: iteration 2429/ 3814 | consumed samples: 1243648 | consumed tokens: 2546991104 | elapsed time per iteration (s): 5.45 | learning rate: 3.467E-05 | global batch size: 512 | lm loss: 5.024330E+00 | loss scale: 65536.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.904 | TFLOPs: 70.93 | +[default7]: iteration 2430/ 3814 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 5.43 | learning rate: 3.463E-05 | global batch size: 512 | lm loss: 5.003408E+00 | loss scale: 65536.0 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.216 | TFLOPs: 71.16 | +[default7]: iteration 2431/ 3814 | consumed samples: 1244672 | consumed tokens: 2549088256 | elapsed time per iteration (s): 5.45 | learning rate: 3.459E-05 | global batch size: 512 | lm loss: 5.034333E+00 | loss scale: 65536.0 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.902 | TFLOPs: 70.92 | +[default7]: iteration 2432/ 3814 | consumed samples: 1245184 | consumed tokens: 2550136832 | elapsed time per iteration (s): 5.43 | learning rate: 3.454E-05 | global batch size: 512 | lm loss: 5.028014E+00 | loss scale: 65536.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.218 | TFLOPs: 71.16 | +[default7]: iteration 2433/ 3814 | consumed samples: 1245696 | consumed tokens: 2551185408 | elapsed time per iteration (s): 5.45 | learning rate: 3.450E-05 | global batch size: 512 | lm loss: 5.015933E+00 | loss scale: 65536.0 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.019 | TFLOPs: 71.01 | +[default7]: iteration 2434/ 3814 | consumed samples: 1246208 | consumed tokens: 2552233984 | elapsed time per iteration (s): 5.45 | learning rate: 3.446E-05 | global batch size: 512 | lm loss: 5.022120E+00 | loss scale: 65536.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.960 | TFLOPs: 70.97 | +[default7]: iteration 2435/ 3814 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 5.43 | learning rate: 3.441E-05 | global batch size: 512 | lm loss: 5.027170E+00 | loss scale: 65536.0 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.314 | TFLOPs: 71.24 | +[default7]: iteration 2436/ 3814 | consumed samples: 1247232 | consumed tokens: 2554331136 | elapsed time per iteration (s): 5.45 | learning rate: 3.437E-05 | global batch size: 512 | lm loss: 5.035166E+00 | loss scale: 65536.0 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.933 | TFLOPs: 70.95 | +[default7]: iteration 2437/ 3814 | consumed samples: 1247744 | consumed tokens: 2555379712 | elapsed time per iteration (s): 5.45 | learning rate: 3.433E-05 | global batch size: 512 | lm loss: 5.021499E+00 | loss scale: 65536.0 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.994 | TFLOPs: 71.00 | +[default7]: iteration 2438/ 3814 | consumed samples: 1248256 | consumed tokens: 2556428288 | elapsed time per iteration (s): 5.47 | learning rate: 3.428E-05 | global batch size: 512 | lm loss: 5.031065E+00 | loss scale: 65536.0 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.649 | TFLOPs: 70.73 | +[default7]: iteration 2439/ 3814 | consumed samples: 1248768 | consumed tokens: 2557476864 | elapsed time per iteration (s): 5.47 | learning rate: 3.424E-05 | global batch size: 512 | lm loss: 5.032934E+00 | loss scale: 65536.0 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.594 | TFLOPs: 70.69 | +[default7]: iteration 2440/ 3814 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 5.51 | learning rate: 3.420E-05 | global batch size: 512 | lm loss: 5.005402E+00 | loss scale: 65536.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.909 | TFLOPs: 70.18 | +[default7]: iteration 2441/ 3814 | consumed samples: 1249792 | consumed tokens: 2559574016 | elapsed time per iteration (s): 5.51 | learning rate: 3.416E-05 | global batch size: 512 | lm loss: 5.030750E+00 | loss scale: 65536.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.910 | TFLOPs: 70.18 | +[default7]: iteration 2442/ 3814 | consumed samples: 1250304 | consumed tokens: 2560622592 | elapsed time per iteration (s): 5.46 | learning rate: 3.411E-05 | global batch size: 512 | lm loss: 5.015958E+00 | loss scale: 65536.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.704 | TFLOPs: 70.78 | +[default7]: iteration 2443/ 3814 | consumed samples: 1250816 | consumed tokens: 2561671168 | elapsed time per iteration (s): 5.46 | learning rate: 3.407E-05 | global batch size: 512 | lm loss: 5.040599E+00 | loss scale: 65536.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.785 | TFLOPs: 70.84 | +[default7]: iteration 2444/ 3814 | consumed samples: 1251328 | consumed tokens: 2562719744 | elapsed time per iteration (s): 5.47 | learning rate: 3.403E-05 | global batch size: 512 | lm loss: 4.999961E+00 | loss scale: 65536.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.639 | TFLOPs: 70.73 | +[default7]: iteration 2445/ 3814 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 5.46 | learning rate: 3.398E-05 | global batch size: 512 | lm loss: 5.023714E+00 | loss scale: 65536.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.796 | TFLOPs: 70.85 | +[default7]: iteration 2446/ 3814 | consumed samples: 1252352 | consumed tokens: 2564816896 | elapsed time per iteration (s): 5.47 | learning rate: 3.394E-05 | global batch size: 512 | lm loss: 5.014254E+00 | loss scale: 65536.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.547 | TFLOPs: 70.66 | +[default7]: iteration 2447/ 3814 | consumed samples: 1252864 | consumed tokens: 2565865472 | elapsed time per iteration (s): 5.46 | learning rate: 3.390E-05 | global batch size: 512 | lm loss: 4.999884E+00 | loss scale: 65536.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.720 | TFLOPs: 70.79 | +[default7]: iteration 2448/ 3814 | consumed samples: 1253376 | consumed tokens: 2566914048 | elapsed time per iteration (s): 5.48 | learning rate: 3.385E-05 | global batch size: 512 | lm loss: 5.023596E+00 | loss scale: 65536.0 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.374 | TFLOPs: 70.53 | +[default7]: iteration 2449/ 3814 | consumed samples: 1253888 | consumed tokens: 2567962624 | elapsed time per iteration (s): 5.49 | learning rate: 3.381E-05 | global batch size: 512 | lm loss: 5.025923E+00 | loss scale: 65536.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.245 | TFLOPs: 70.43 | +[default7]: iteration 2450/ 3814 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 5.46 | learning rate: 3.377E-05 | global batch size: 512 | lm loss: 5.017723E+00 | loss scale: 65536.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.744 | TFLOPs: 70.81 | +[default7]: iteration 2451/ 3814 | consumed samples: 1254912 | consumed tokens: 2570059776 | elapsed time per iteration (s): 5.45 | learning rate: 3.372E-05 | global batch size: 512 | lm loss: 4.992544E+00 | loss scale: 65536.0 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.934 | TFLOPs: 70.95 | +[default7]: iteration 2452/ 3814 | consumed samples: 1255424 | consumed tokens: 2571108352 | elapsed time per iteration (s): 5.46 | learning rate: 3.368E-05 | global batch size: 512 | lm loss: 5.032400E+00 | loss scale: 65536.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.767 | TFLOPs: 70.82 | +[default7]: iteration 2453/ 3814 | consumed samples: 1255936 | consumed tokens: 2572156928 | elapsed time per iteration (s): 5.47 | learning rate: 3.364E-05 | global batch size: 512 | lm loss: 5.036128E+00 | loss scale: 65536.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.650 | TFLOPs: 70.73 | +[default7]: iteration 2454/ 3814 | consumed samples: 1256448 | consumed tokens: 2573205504 | elapsed time per iteration (s): 5.46 | learning rate: 3.360E-05 | global batch size: 512 | lm loss: 5.038385E+00 | loss scale: 65536.0 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.822 | TFLOPs: 70.86 | +[default7]: iteration 2455/ 3814 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 5.46 | learning rate: 3.355E-05 | global batch size: 512 | lm loss: 5.058496E+00 | loss scale: 65536.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.832 | TFLOPs: 70.87 | +[default7]: iteration 2456/ 3814 | consumed samples: 1257472 | consumed tokens: 2575302656 | elapsed time per iteration (s): 5.47 | learning rate: 3.351E-05 | global batch size: 512 | lm loss: 5.021070E+00 | loss scale: 65536.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.668 | TFLOPs: 70.75 | +[default7]: iteration 2457/ 3814 | consumed samples: 1257984 | consumed tokens: 2576351232 | elapsed time per iteration (s): 5.46 | learning rate: 3.347E-05 | global batch size: 512 | lm loss: 5.044146E+00 | loss scale: 65536.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.754 | TFLOPs: 70.81 | +[default7]: iteration 2458/ 3814 | consumed samples: 1258496 | consumed tokens: 2577399808 | elapsed time per iteration (s): 5.47 | learning rate: 3.342E-05 | global batch size: 512 | lm loss: 5.013479E+00 | loss scale: 65536.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.528 | TFLOPs: 70.64 | +[default7]: iteration 2459/ 3814 | consumed samples: 1259008 | consumed tokens: 2578448384 | elapsed time per iteration (s): 5.47 | learning rate: 3.338E-05 | global batch size: 512 | lm loss: 5.017700E+00 | loss scale: 65536.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.622 | TFLOPs: 70.71 | +[default7]: iteration 2460/ 3814 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 5.45 | learning rate: 3.334E-05 | global batch size: 512 | lm loss: 5.003069E+00 | loss scale: 65536.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.917 | TFLOPs: 70.94 | +[default7]: iteration 2461/ 3814 | consumed samples: 1260032 | consumed tokens: 2580545536 | elapsed time per iteration (s): 5.44 | learning rate: 3.330E-05 | global batch size: 512 | lm loss: 5.013693E+00 | loss scale: 65536.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.099 | TFLOPs: 71.07 | +[default7]: iteration 2462/ 3814 | consumed samples: 1260544 | consumed tokens: 2581594112 | elapsed time per iteration (s): 5.46 | learning rate: 3.325E-05 | global batch size: 512 | lm loss: 5.029838E+00 | loss scale: 65536.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.718 | TFLOPs: 70.79 | +[default7]: iteration 2463/ 3814 | consumed samples: 1261056 | consumed tokens: 2582642688 | elapsed time per iteration (s): 5.44 | learning rate: 3.321E-05 | global batch size: 512 | lm loss: 5.010453E+00 | loss scale: 65536.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.069 | TFLOPs: 71.05 | +[default7]: iteration 2464/ 3814 | consumed samples: 1261568 | consumed tokens: 2583691264 | elapsed time per iteration (s): 5.44 | learning rate: 3.317E-05 | global batch size: 512 | lm loss: 5.017617E+00 | loss scale: 65536.0 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.033 | TFLOPs: 71.02 | +[default7]: iteration 2465/ 3814 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 5.46 | learning rate: 3.312E-05 | global batch size: 512 | lm loss: 5.035493E+00 | loss scale: 65536.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.839 | TFLOPs: 70.88 | +[default7]: iteration 2466/ 3814 | consumed samples: 1262592 | consumed tokens: 2585788416 | elapsed time per iteration (s): 5.45 | learning rate: 3.308E-05 | global batch size: 512 | lm loss: 5.018333E+00 | loss scale: 65536.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.912 | TFLOPs: 70.93 | +[default7]: iteration 2467/ 3814 | consumed samples: 1263104 | consumed tokens: 2586836992 | elapsed time per iteration (s): 5.45 | learning rate: 3.304E-05 | global batch size: 512 | lm loss: 5.008787E+00 | loss scale: 65536.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.929 | TFLOPs: 70.95 | +[default7]: iteration 2468/ 3814 | consumed samples: 1263616 | consumed tokens: 2587885568 | elapsed time per iteration (s): 5.47 | learning rate: 3.300E-05 | global batch size: 512 | lm loss: 5.015110E+00 | loss scale: 65536.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.683 | TFLOPs: 70.76 | +[default7]: iteration 2469/ 3814 | consumed samples: 1264128 | consumed tokens: 2588934144 | elapsed time per iteration (s): 5.48 | learning rate: 3.295E-05 | global batch size: 512 | lm loss: 5.007729E+00 | loss scale: 65536.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.358 | TFLOPs: 70.51 | +[default7]: iteration 2470/ 3814 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 5.45 | learning rate: 3.291E-05 | global batch size: 512 | lm loss: 5.041159E+00 | loss scale: 65536.0 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.865 | TFLOPs: 70.90 | +[default7]: iteration 2471/ 3814 | consumed samples: 1265152 | consumed tokens: 2591031296 | elapsed time per iteration (s): 5.44 | learning rate: 3.287E-05 | global batch size: 512 | lm loss: 5.019788E+00 | loss scale: 65536.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.136 | TFLOPs: 71.10 | +[default7]: iteration 2472/ 3814 | consumed samples: 1265664 | consumed tokens: 2592079872 | elapsed time per iteration (s): 5.45 | learning rate: 3.283E-05 | global batch size: 512 | lm loss: 5.025031E+00 | loss scale: 65536.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.996 | TFLOPs: 71.00 | +[default7]: iteration 2473/ 3814 | consumed samples: 1266176 | consumed tokens: 2593128448 | elapsed time per iteration (s): 5.45 | learning rate: 3.278E-05 | global batch size: 512 | lm loss: 5.009505E+00 | loss scale: 65536.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.882 | TFLOPs: 70.91 | +[default7]: iteration 2474/ 3814 | consumed samples: 1266688 | consumed tokens: 2594177024 | elapsed time per iteration (s): 5.46 | learning rate: 3.274E-05 | global batch size: 512 | lm loss: 5.010579E+00 | loss scale: 65536.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.833 | TFLOPs: 70.87 | +[default7]: iteration 2475/ 3814 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 5.44 | learning rate: 3.270E-05 | global batch size: 512 | lm loss: 5.009510E+00 | loss scale: 65536.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.101 | TFLOPs: 71.08 | +[default7]: iteration 2476/ 3814 | consumed samples: 1267712 | consumed tokens: 2596274176 | elapsed time per iteration (s): 5.46 | learning rate: 3.266E-05 | global batch size: 512 | lm loss: 5.009994E+00 | loss scale: 65536.0 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.789 | TFLOPs: 70.84 | +[default7]: iteration 2477/ 3814 | consumed samples: 1268224 | consumed tokens: 2597322752 | elapsed time per iteration (s): 5.45 | learning rate: 3.261E-05 | global batch size: 512 | lm loss: 5.009589E+00 | loss scale: 65536.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.875 | TFLOPs: 70.90 | +[default7]: iteration 2478/ 3814 | consumed samples: 1268736 | consumed tokens: 2598371328 | elapsed time per iteration (s): 5.43 | learning rate: 3.257E-05 | global batch size: 512 | lm loss: 5.008752E+00 | loss scale: 65536.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.300 | TFLOPs: 71.23 | +[default7]: iteration 2479/ 3814 | consumed samples: 1269248 | consumed tokens: 2599419904 | elapsed time per iteration (s): 5.42 | learning rate: 3.253E-05 | global batch size: 512 | lm loss: 5.013034E+00 | loss scale: 65536.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.483 | TFLOPs: 71.36 | +[default7]: iteration 2480/ 3814 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 5.43 | learning rate: 3.248E-05 | global batch size: 512 | lm loss: 5.016242E+00 | loss scale: 65536.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.238 | TFLOPs: 71.18 | +[default7]: iteration 2481/ 3814 | consumed samples: 1270272 | consumed tokens: 2601517056 | elapsed time per iteration (s): 5.43 | learning rate: 3.244E-05 | global batch size: 512 | lm loss: 5.006701E+00 | loss scale: 65536.0 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.254 | TFLOPs: 71.19 | +[default7]: iteration 2482/ 3814 | consumed samples: 1270784 | consumed tokens: 2602565632 | elapsed time per iteration (s): 5.46 | learning rate: 3.240E-05 | global batch size: 512 | lm loss: 4.979826E+00 | loss scale: 65536.0 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.837 | TFLOPs: 70.88 | +[default7]: iteration 2483/ 3814 | consumed samples: 1271296 | consumed tokens: 2603614208 | elapsed time per iteration (s): 5.45 | learning rate: 3.236E-05 | global batch size: 512 | lm loss: 5.022943E+00 | loss scale: 65536.0 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.974 | TFLOPs: 70.98 | +[default7]: iteration 2484/ 3814 | consumed samples: 1271808 | consumed tokens: 2604662784 | elapsed time per iteration (s): 5.44 | learning rate: 3.231E-05 | global batch size: 512 | lm loss: 5.030946E+00 | loss scale: 65536.0 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.174 | TFLOPs: 71.13 | +[default7]: iteration 2485/ 3814 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 5.47 | learning rate: 3.227E-05 | global batch size: 512 | lm loss: 5.014763E+00 | loss scale: 65536.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.557 | TFLOPs: 70.66 | +[default7]: iteration 2486/ 3814 | consumed samples: 1272832 | consumed tokens: 2606759936 | elapsed time per iteration (s): 5.45 | learning rate: 3.223E-05 | global batch size: 512 | lm loss: 5.012992E+00 | loss scale: 65536.0 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.026 | TFLOPs: 71.02 | +[default7]: iteration 2487/ 3814 | consumed samples: 1273344 | consumed tokens: 2607808512 | elapsed time per iteration (s): 5.45 | learning rate: 3.219E-05 | global batch size: 512 | lm loss: 5.014792E+00 | loss scale: 65536.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.983 | TFLOPs: 70.99 | +[default7]: iteration 2488/ 3814 | consumed samples: 1273856 | consumed tokens: 2608857088 | elapsed time per iteration (s): 5.44 | learning rate: 3.214E-05 | global batch size: 512 | lm loss: 5.011561E+00 | loss scale: 65536.0 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.121 | TFLOPs: 71.09 | +[default7]: iteration 2489/ 3814 | consumed samples: 1274368 | consumed tokens: 2609905664 | elapsed time per iteration (s): 5.43 | learning rate: 3.210E-05 | global batch size: 512 | lm loss: 5.000947E+00 | loss scale: 65536.0 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.213 | TFLOPs: 71.16 | +[default7]: iteration 2490/ 3814 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 5.44 | learning rate: 3.206E-05 | global batch size: 512 | lm loss: 5.019254E+00 | loss scale: 65536.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.057 | TFLOPs: 71.04 | +[default7]: iteration 2491/ 3814 | consumed samples: 1275392 | consumed tokens: 2612002816 | elapsed time per iteration (s): 5.46 | learning rate: 3.202E-05 | global batch size: 512 | lm loss: 4.981050E+00 | loss scale: 65536.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.798 | TFLOPs: 70.85 | +[default7]: iteration 2492/ 3814 | consumed samples: 1275904 | consumed tokens: 2613051392 | elapsed time per iteration (s): 5.44 | learning rate: 3.198E-05 | global batch size: 512 | lm loss: 5.012168E+00 | loss scale: 65536.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.115 | TFLOPs: 71.09 | +[default7]: iteration 2493/ 3814 | consumed samples: 1276416 | consumed tokens: 2614099968 | elapsed time per iteration (s): 5.45 | learning rate: 3.193E-05 | global batch size: 512 | lm loss: 5.036778E+00 | loss scale: 65536.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.031 | TFLOPs: 71.02 | +[default7]: iteration 2494/ 3814 | consumed samples: 1276928 | consumed tokens: 2615148544 | elapsed time per iteration (s): 5.43 | learning rate: 3.189E-05 | global batch size: 512 | lm loss: 5.017893E+00 | loss scale: 65536.0 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.316 | TFLOPs: 71.24 | +[default7]: iteration 2495/ 3814 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 5.46 | learning rate: 3.185E-05 | global batch size: 512 | lm loss: 5.020662E+00 | loss scale: 65536.0 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.772 | TFLOPs: 70.83 | +[default7]: iteration 2496/ 3814 | consumed samples: 1277952 | consumed tokens: 2617245696 | elapsed time per iteration (s): 5.44 | learning rate: 3.181E-05 | global batch size: 512 | lm loss: 5.014403E+00 | loss scale: 65536.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.142 | TFLOPs: 71.11 | +[default7]: iteration 2497/ 3814 | consumed samples: 1278464 | consumed tokens: 2618294272 | elapsed time per iteration (s): 5.46 | learning rate: 3.176E-05 | global batch size: 512 | lm loss: 4.998825E+00 | loss scale: 65536.0 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.792 | TFLOPs: 70.84 | +[default7]: iteration 2498/ 3814 | consumed samples: 1278976 | consumed tokens: 2619342848 | elapsed time per iteration (s): 5.45 | learning rate: 3.172E-05 | global batch size: 512 | lm loss: 5.016675E+00 | loss scale: 65536.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.976 | TFLOPs: 70.98 | +[default7]: iteration 2499/ 3814 | consumed samples: 1279488 | consumed tokens: 2620391424 | elapsed time per iteration (s): 5.46 | learning rate: 3.168E-05 | global batch size: 512 | lm loss: 5.008026E+00 | loss scale: 65536.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.818 | TFLOPs: 70.86 | +[default0]:saving checkpoint at iteration 2500 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-16 16:19:45,864] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2500 is begin to save! +[default0]:[2023-02-16 16:19:45,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_01-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:45,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_01-model_01-model_states.pt... +[default7]: iteration 2500/ 3814 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 5.45 | learning rate: 3.164E-05 | global batch size: 512 | lm loss: 5.000529E+00 | loss scale: 131072.0 | grad norm: 0.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.023 | TFLOPs: 71.02 | +[default0]:[2023-02-16 16:19:46,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_01-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_04-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_01-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_04-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:46,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_04-model_01-model_states.pt. +[default0]:[2023-02-16 16:19:46,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_04-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_05-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:46,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_05-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_06-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_05-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:46,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_05-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_06-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_06-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_07-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:46,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_07-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,300] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_08-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_06-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,235] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_07-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:46,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_07-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_08-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_08-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,361] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_09-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_08-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_09-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:46,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_09-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,409] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_10-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_09-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_10-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:46,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_10-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,476] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_11-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_10-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_11-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_11-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_12-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:46,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_12-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_13-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_11-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_12-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:46,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_12-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_13-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_13-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,649] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_14-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:46,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_14-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_15-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_13-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_14-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:46,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_14-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_15-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_15-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,773] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_16-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_15-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_16-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_16-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_17-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_16-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_17-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:46,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_17-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_18-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:46,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_17-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_18-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:46,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_18-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:46,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_19-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:46,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_18-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:46,954] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_19-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_19-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_20-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:47,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_20-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,103] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_21-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_19-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_20-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_20-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_21-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_21-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_22-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_21-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_22-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_22-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,199] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_23-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_22-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_23-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:47,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_23-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,281] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_24-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_23-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_24-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_24-model_01-model_states.pt. +[default0]:[2023-02-16 16:19:47,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_24-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,337] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_25-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:47,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_25-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,394] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_26-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,315] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_25-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_25-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_26-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_26-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_27-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_26-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,426] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_27-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_27-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_28-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_27-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_28-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:47,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_28-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_29-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_28-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,537] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_29-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_29-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,593] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_30-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_29-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_30-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:47,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_30-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,690] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_31-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_30-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,653] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_31-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_31-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_32-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_31-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_32-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:47,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_32-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_33-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_32-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_33-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_33-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_34-model_00-model_states.pt... +[default1]:[2023-02-16 16:19:47,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_33-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,833] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_34-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_34-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_35-model_01-model_states.pt... +[default0]:[2023-02-16 16:19:47,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_34-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,924] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_35-model_00-model_states.pt... +[default0]:[2023-02-16 16:19:47,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_35-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_37-model_00-model_states.pt... +[default6]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_27_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_10_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_12_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_31_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_5_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_13_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:19:47,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_37-model_00-model_states.pt. +[default0]:[2023-02-16 16:19:47,983] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/mp_rank_00_model_states.pt +[default0]:[2023-02-16 16:19:47,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/mp_rank_00_model_states.pt... +[default0]:[2023-02-16 16:19:47,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/mp_rank_00_model_states.pt. +[default1]:[2023-02-16 16:19:47,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_35-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_37-model_01-model_states.pt... +[default1]:[2023-02-16 16:19:47,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/layer_37-model_01-model_states.pt. +[default1]:[2023-02-16 16:19:47,948] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/mp_rank_01_model_states.pt +[default1]:[2023-02-16 16:19:47,948] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/mp_rank_01_model_states.pt... +[default1]:[2023-02-16 16:19:47,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/mp_rank_01_model_states.pt. +[default6]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_3_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_19_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_29_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_8_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_26_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_4_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_17_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_14_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_28_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_25_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_11_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_24_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_16_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_15_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_7_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_1_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_20_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_18_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_23_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_22_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_30_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_9_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:19:48,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_21_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_2_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_0_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_6_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:19:48,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:19:48,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,504] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_17_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,529] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_17_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2023-02-16 16:19:48,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,506] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,561] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2023-02-16 16:19:48,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,567] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,556] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_9_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,544] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_9_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_27_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,572] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_27_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_12_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,597] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_12_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_5_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,619] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_5_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_1_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,649] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_1_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_29_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,590] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_29_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_8_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,632] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_8_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,651] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,676] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_14_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,647] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_14_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,651] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_11_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,655] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_11_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2023-02-16 16:19:48,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,650] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_16_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,663] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_16_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_15_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,687] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_15_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_7_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,673] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_7_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2023-02-16 16:19:48,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,671] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2023-02-16 16:19:48,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,673] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_30_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,693] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_30_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2023-02-16 16:19:48,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,689] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2023-02-16 16:19:48,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,668] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_21_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,630] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_21_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_2_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,643] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_2_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_0_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,671] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_0_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,709] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,666] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_10_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,667] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_10_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,643] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,713] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2023-02-16 16:19:48,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,666] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_31_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,702] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_31_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,661] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_3_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,665] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_3_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_13_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,716] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_13_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_19_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,680] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_19_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,711] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2023-02-16 16:19:48,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,707] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_26_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,733] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_26_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,713] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_4_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,740] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_4_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,689] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_28_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,699] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_28_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,709] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_20_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,792] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_20_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_18_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,705] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_18_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2023-02-16 16:19:48,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,710] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2023-02-16 16:19:48,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:19:48,718] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:19:48,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_6_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,728] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_6_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2023-02-16 16:19:48,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,787] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2023-02-16 16:19:48,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,738] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2023-02-16 16:19:48,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,831] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2023-02-16 16:19:48,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:19:48,799] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:19:48,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2023-02-16 16:19:48,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_25_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:19:48,830] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_25_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:19:48,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2023-02-16 16:19:48,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_24_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:19:48,793] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_24_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:19:48,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2023-02-16 16:19:48,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_23_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:19:48,810] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_23_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:19:48,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2023-02-16 16:19:48,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_22_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:19:48,859] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_22_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:19:48,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]: successfully saved checkpoint at iteration 2500 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default2]:[2023-02-16 16:19:48,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:19:48,834] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:19:48,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:time (ms) | save-checkpoint: 3001.51 +[default4]:[2023-02-16 16:19:48,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:19:48,863] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2500/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:19:48,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]: iteration 2501/ 3814 | consumed samples: 1280512 | consumed tokens: 2622488576 | elapsed time per iteration (s): 8.46 | learning rate: 3.159E-05 | global batch size: 512 | lm loss: 5.013082E+00 | loss scale: 131072.0 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 60.534 | TFLOPs: 45.72 | +[default7]: iteration 2502/ 3814 | consumed samples: 1281024 | consumed tokens: 2623537152 | elapsed time per iteration (s): 5.46 | learning rate: 3.155E-05 | global batch size: 512 | lm loss: 4.994108E+00 | loss scale: 131072.0 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.826 | TFLOPs: 70.87 | +[default7]: iteration 2503/ 3814 | consumed samples: 1281536 | consumed tokens: 2624585728 | elapsed time per iteration (s): 5.44 | learning rate: 3.151E-05 | global batch size: 512 | lm loss: 5.004553E+00 | loss scale: 131072.0 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.111 | TFLOPs: 71.08 | +[default7]: iteration 2504/ 3814 | consumed samples: 1282048 | consumed tokens: 2625634304 | elapsed time per iteration (s): 5.47 | learning rate: 3.147E-05 | global batch size: 512 | lm loss: 4.998404E+00 | loss scale: 131072.0 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.631 | TFLOPs: 70.72 | +[default7]: iteration 2505/ 3814 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 5.47 | learning rate: 3.143E-05 | global batch size: 512 | lm loss: 5.016654E+00 | loss scale: 131072.0 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.683 | TFLOPs: 70.76 | +[default7]: iteration 2506/ 3814 | consumed samples: 1283072 | consumed tokens: 2627731456 | elapsed time per iteration (s): 5.49 | learning rate: 3.138E-05 | global batch size: 512 | lm loss: 5.009479E+00 | loss scale: 131072.0 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.181 | TFLOPs: 70.38 | +[default7]: iteration 2507/ 3814 | consumed samples: 1283584 | consumed tokens: 2628780032 | elapsed time per iteration (s): 5.46 | learning rate: 3.134E-05 | global batch size: 512 | lm loss: 4.999732E+00 | loss scale: 131072.0 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.803 | TFLOPs: 70.85 | +[default7]: iteration 2508/ 3814 | consumed samples: 1284096 | consumed tokens: 2629828608 | elapsed time per iteration (s): 5.47 | learning rate: 3.130E-05 | global batch size: 512 | lm loss: 5.025920E+00 | loss scale: 131072.0 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.555 | TFLOPs: 70.66 | +[default7]: iteration 2509/ 3814 | consumed samples: 1284608 | consumed tokens: 2630877184 | elapsed time per iteration (s): 5.44 | learning rate: 3.126E-05 | global batch size: 512 | lm loss: 5.015537E+00 | loss scale: 131072.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.172 | TFLOPs: 71.13 | +[default7]: iteration 2510/ 3814 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 5.46 | learning rate: 3.121E-05 | global batch size: 512 | lm loss: 4.988099E+00 | loss scale: 131072.0 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.739 | TFLOPs: 70.80 | +[default7]: iteration 2511/ 3814 | consumed samples: 1285632 | consumed tokens: 2632974336 | elapsed time per iteration (s): 5.46 | learning rate: 3.117E-05 | global batch size: 512 | lm loss: 5.034081E+00 | loss scale: 131072.0 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.757 | TFLOPs: 70.82 | +[default7]: iteration 2512/ 3814 | consumed samples: 1286144 | consumed tokens: 2634022912 | elapsed time per iteration (s): 5.48 | learning rate: 3.113E-05 | global batch size: 512 | lm loss: 4.996200E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.489 | TFLOPs: 70.61 | +[default7]: iteration 2513/ 3814 | consumed samples: 1286656 | consumed tokens: 2635071488 | elapsed time per iteration (s): 5.48 | learning rate: 3.109E-05 | global batch size: 512 | lm loss: 5.031662E+00 | loss scale: 131072.0 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.387 | TFLOPs: 70.54 | +[default7]: iteration 2514/ 3814 | consumed samples: 1287168 | consumed tokens: 2636120064 | elapsed time per iteration (s): 5.48 | learning rate: 3.105E-05 | global batch size: 512 | lm loss: 5.006852E+00 | loss scale: 131072.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.417 | TFLOPs: 70.56 | +[default7]: iteration 2515/ 3814 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 5.49 | learning rate: 3.100E-05 | global batch size: 512 | lm loss: 5.021169E+00 | loss scale: 131072.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.221 | TFLOPs: 70.41 | +[default7]: iteration 2516/ 3814 | consumed samples: 1288192 | consumed tokens: 2638217216 | elapsed time per iteration (s): 5.50 | learning rate: 3.096E-05 | global batch size: 512 | lm loss: 5.020951E+00 | loss scale: 131072.0 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.133 | TFLOPs: 70.34 | +[default7]: iteration 2517/ 3814 | consumed samples: 1288704 | consumed tokens: 2639265792 | elapsed time per iteration (s): 5.48 | learning rate: 3.092E-05 | global batch size: 512 | lm loss: 4.977255E+00 | loss scale: 131072.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.443 | TFLOPs: 70.58 | +[default7]: iteration 2518/ 3814 | consumed samples: 1289216 | consumed tokens: 2640314368 | elapsed time per iteration (s): 5.47 | learning rate: 3.088E-05 | global batch size: 512 | lm loss: 5.006687E+00 | loss scale: 131072.0 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.586 | TFLOPs: 70.69 | +[default7]: iteration 2519/ 3814 | consumed samples: 1289728 | consumed tokens: 2641362944 | elapsed time per iteration (s): 5.44 | learning rate: 3.084E-05 | global batch size: 512 | lm loss: 4.993551E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.118 | TFLOPs: 71.09 | +[default7]: iteration 2520/ 3814 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 5.43 | learning rate: 3.079E-05 | global batch size: 512 | lm loss: 5.019583E+00 | loss scale: 131072.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.288 | TFLOPs: 71.22 | +[default7]: iteration 2521/ 3814 | consumed samples: 1290752 | consumed tokens: 2643460096 | elapsed time per iteration (s): 5.44 | learning rate: 3.075E-05 | global batch size: 512 | lm loss: 4.984248E+00 | loss scale: 131072.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.102 | TFLOPs: 71.08 | +[default7]: iteration 2522/ 3814 | consumed samples: 1291264 | consumed tokens: 2644508672 | elapsed time per iteration (s): 5.43 | learning rate: 3.071E-05 | global batch size: 512 | lm loss: 5.012196E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.256 | TFLOPs: 71.19 | +[default7]: iteration 2523/ 3814 | consumed samples: 1291776 | consumed tokens: 2645557248 | elapsed time per iteration (s): 5.44 | learning rate: 3.067E-05 | global batch size: 512 | lm loss: 5.027071E+00 | loss scale: 131072.0 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.048 | TFLOPs: 71.04 | +[default7]: iteration 2524/ 3814 | consumed samples: 1292288 | consumed tokens: 2646605824 | elapsed time per iteration (s): 5.45 | learning rate: 3.063E-05 | global batch size: 512 | lm loss: 4.996587E+00 | loss scale: 131072.0 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.005 | TFLOPs: 71.00 | +[default7]: iteration 2525/ 3814 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 5.46 | learning rate: 3.058E-05 | global batch size: 512 | lm loss: 4.993607E+00 | loss scale: 131072.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.826 | TFLOPs: 70.87 | +[default7]: iteration 2526/ 3814 | consumed samples: 1293312 | consumed tokens: 2648702976 | elapsed time per iteration (s): 5.43 | learning rate: 3.054E-05 | global batch size: 512 | lm loss: 5.023668E+00 | loss scale: 131072.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.345 | TFLOPs: 71.26 | +[default7]: iteration 2527/ 3814 | consumed samples: 1293824 | consumed tokens: 2649751552 | elapsed time per iteration (s): 5.45 | learning rate: 3.050E-05 | global batch size: 512 | lm loss: 4.996908E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.950 | TFLOPs: 70.96 | +[default7]: iteration 2528/ 3814 | consumed samples: 1294336 | consumed tokens: 2650800128 | elapsed time per iteration (s): 5.45 | learning rate: 3.046E-05 | global batch size: 512 | lm loss: 4.995095E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.026 | TFLOPs: 71.02 | +[default7]: iteration 2529/ 3814 | consumed samples: 1294848 | consumed tokens: 2651848704 | elapsed time per iteration (s): 5.45 | learning rate: 3.042E-05 | global batch size: 512 | lm loss: 5.014027E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.012 | TFLOPs: 71.01 | +[default7]: iteration 2530/ 3814 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 5.44 | learning rate: 3.038E-05 | global batch size: 512 | lm loss: 4.997985E+00 | loss scale: 131072.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.176 | TFLOPs: 71.13 | +[default7]: iteration 2531/ 3814 | consumed samples: 1295872 | consumed tokens: 2653945856 | elapsed time per iteration (s): 5.43 | learning rate: 3.033E-05 | global batch size: 512 | lm loss: 4.989199E+00 | loss scale: 131072.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.284 | TFLOPs: 71.21 | +[default7]: iteration 2532/ 3814 | consumed samples: 1296384 | consumed tokens: 2654994432 | elapsed time per iteration (s): 5.42 | learning rate: 3.029E-05 | global batch size: 512 | lm loss: 5.003016E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.463 | TFLOPs: 71.35 | +[default7]: iteration 2533/ 3814 | consumed samples: 1296896 | consumed tokens: 2656043008 | elapsed time per iteration (s): 5.43 | learning rate: 3.025E-05 | global batch size: 512 | lm loss: 4.990096E+00 | loss scale: 131072.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.359 | TFLOPs: 71.27 | +[default7]: iteration 2534/ 3814 | consumed samples: 1297408 | consumed tokens: 2657091584 | elapsed time per iteration (s): 5.45 | learning rate: 3.021E-05 | global batch size: 512 | lm loss: 5.018195E+00 | loss scale: 131072.0 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.934 | TFLOPs: 70.95 | +[default7]: iteration 2535/ 3814 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 5.43 | learning rate: 3.017E-05 | global batch size: 512 | lm loss: 5.033711E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.267 | TFLOPs: 71.20 | +[default7]: iteration 2536/ 3814 | consumed samples: 1298432 | consumed tokens: 2659188736 | elapsed time per iteration (s): 5.47 | learning rate: 3.013E-05 | global batch size: 512 | lm loss: 5.010715E+00 | loss scale: 131072.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.567 | TFLOPs: 70.67 | +[default7]: iteration 2537/ 3814 | consumed samples: 1298944 | consumed tokens: 2660237312 | elapsed time per iteration (s): 5.46 | learning rate: 3.008E-05 | global batch size: 512 | lm loss: 4.987988E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.740 | TFLOPs: 70.80 | +[default7]: iteration 2538/ 3814 | consumed samples: 1299456 | consumed tokens: 2661285888 | elapsed time per iteration (s): 5.43 | learning rate: 3.004E-05 | global batch size: 512 | lm loss: 4.993207E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.316 | TFLOPs: 71.24 | +[default7]: iteration 2539/ 3814 | consumed samples: 1299968 | consumed tokens: 2662334464 | elapsed time per iteration (s): 5.42 | learning rate: 3.000E-05 | global batch size: 512 | lm loss: 5.018769E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.424 | TFLOPs: 71.32 | +[default7]: iteration 2540/ 3814 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 5.42 | learning rate: 2.996E-05 | global batch size: 512 | lm loss: 5.012038E+00 | loss scale: 131072.0 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.482 | TFLOPs: 71.36 | +[default7]: iteration 2541/ 3814 | consumed samples: 1300992 | consumed tokens: 2664431616 | elapsed time per iteration (s): 5.42 | learning rate: 2.992E-05 | global batch size: 512 | lm loss: 4.984642E+00 | loss scale: 131072.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.386 | TFLOPs: 71.29 | +[default7]: iteration 2542/ 3814 | consumed samples: 1301504 | consumed tokens: 2665480192 | elapsed time per iteration (s): 5.42 | learning rate: 2.988E-05 | global batch size: 512 | lm loss: 5.007148E+00 | loss scale: 131072.0 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.434 | TFLOPs: 71.33 | +[default7]: iteration 2543/ 3814 | consumed samples: 1302016 | consumed tokens: 2666528768 | elapsed time per iteration (s): 5.46 | learning rate: 2.983E-05 | global batch size: 512 | lm loss: 5.006105E+00 | loss scale: 131072.0 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.850 | TFLOPs: 70.89 | +[default7]: iteration 2544/ 3814 | consumed samples: 1302528 | consumed tokens: 2667577344 | elapsed time per iteration (s): 5.44 | learning rate: 2.979E-05 | global batch size: 512 | lm loss: 4.994205E+00 | loss scale: 131072.0 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.109 | TFLOPs: 71.08 | +[default7]: iteration 2545/ 3814 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 5.45 | learning rate: 2.975E-05 | global batch size: 512 | lm loss: 5.006491E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.867 | TFLOPs: 70.90 | +[default7]: iteration 2546/ 3814 | consumed samples: 1303552 | consumed tokens: 2669674496 | elapsed time per iteration (s): 5.47 | learning rate: 2.971E-05 | global batch size: 512 | lm loss: 4.978319E+00 | loss scale: 131072.0 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.550 | TFLOPs: 70.66 | +[default7]: iteration 2547/ 3814 | consumed samples: 1304064 | consumed tokens: 2670723072 | elapsed time per iteration (s): 5.49 | learning rate: 2.967E-05 | global batch size: 512 | lm loss: 4.993232E+00 | loss scale: 131072.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.235 | TFLOPs: 70.42 | +[default7]: iteration 2548/ 3814 | consumed samples: 1304576 | consumed tokens: 2671771648 | elapsed time per iteration (s): 5.45 | learning rate: 2.963E-05 | global batch size: 512 | lm loss: 5.005309E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.902 | TFLOPs: 70.93 | +[default7]: iteration 2549/ 3814 | consumed samples: 1305088 | consumed tokens: 2672820224 | elapsed time per iteration (s): 5.43 | learning rate: 2.958E-05 | global batch size: 512 | lm loss: 4.992957E+00 | loss scale: 131072.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.347 | TFLOPs: 71.26 | +[default7]: iteration 2550/ 3814 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 5.47 | learning rate: 2.954E-05 | global batch size: 512 | lm loss: 5.002323E+00 | loss scale: 131072.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.636 | TFLOPs: 70.72 | +[default7]: iteration 2551/ 3814 | consumed samples: 1306112 | consumed tokens: 2674917376 | elapsed time per iteration (s): 5.49 | learning rate: 2.950E-05 | global batch size: 512 | lm loss: 4.997252E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.246 | TFLOPs: 70.43 | +[default7]: iteration 2552/ 3814 | consumed samples: 1306624 | consumed tokens: 2675965952 | elapsed time per iteration (s): 5.44 | learning rate: 2.946E-05 | global batch size: 512 | lm loss: 4.994909E+00 | loss scale: 131072.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.056 | TFLOPs: 71.04 | +[default7]: iteration 2553/ 3814 | consumed samples: 1307136 | consumed tokens: 2677014528 | elapsed time per iteration (s): 5.43 | learning rate: 2.942E-05 | global batch size: 512 | lm loss: 4.978752E+00 | loss scale: 131072.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.232 | TFLOPs: 71.17 | +[default7]: iteration 2554/ 3814 | consumed samples: 1307648 | consumed tokens: 2678063104 | elapsed time per iteration (s): 5.43 | learning rate: 2.938E-05 | global batch size: 512 | lm loss: 4.989007E+00 | loss scale: 131072.0 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.335 | TFLOPs: 71.25 | +[default7]: iteration 2555/ 3814 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 5.47 | learning rate: 2.934E-05 | global batch size: 512 | lm loss: 4.990509E+00 | loss scale: 131072.0 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.670 | TFLOPs: 70.75 | +[default7]: iteration 2556/ 3814 | consumed samples: 1308672 | consumed tokens: 2680160256 | elapsed time per iteration (s): 5.46 | learning rate: 2.929E-05 | global batch size: 512 | lm loss: 4.999340E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.840 | TFLOPs: 70.88 | +[default7]: iteration 2557/ 3814 | consumed samples: 1309184 | consumed tokens: 2681208832 | elapsed time per iteration (s): 5.45 | learning rate: 2.925E-05 | global batch size: 512 | lm loss: 4.984511E+00 | loss scale: 131072.0 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.951 | TFLOPs: 70.96 | +[default7]: iteration 2558/ 3814 | consumed samples: 1309696 | consumed tokens: 2682257408 | elapsed time per iteration (s): 5.44 | learning rate: 2.921E-05 | global batch size: 512 | lm loss: 4.987556E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.167 | TFLOPs: 71.13 | +[default7]: iteration 2559/ 3814 | consumed samples: 1310208 | consumed tokens: 2683305984 | elapsed time per iteration (s): 5.45 | learning rate: 2.917E-05 | global batch size: 512 | lm loss: 4.966791E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.911 | TFLOPs: 70.93 | +[default7]: iteration 2560/ 3814 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 5.44 | learning rate: 2.913E-05 | global batch size: 512 | lm loss: 4.985851E+00 | loss scale: 131072.0 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.135 | TFLOPs: 71.10 | +[default7]: iteration 2561/ 3814 | consumed samples: 1311232 | consumed tokens: 2685403136 | elapsed time per iteration (s): 5.43 | learning rate: 2.909E-05 | global batch size: 512 | lm loss: 4.993794E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.205 | TFLOPs: 71.15 | +[default7]: iteration 2562/ 3814 | consumed samples: 1311744 | consumed tokens: 2686451712 | elapsed time per iteration (s): 5.43 | learning rate: 2.905E-05 | global batch size: 512 | lm loss: 4.962833E+00 | loss scale: 131072.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.238 | TFLOPs: 71.18 | +[default7]: iteration 2563/ 3814 | consumed samples: 1312256 | consumed tokens: 2687500288 | elapsed time per iteration (s): 5.43 | learning rate: 2.901E-05 | global batch size: 512 | lm loss: 4.976786E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.330 | TFLOPs: 71.25 | +[default7]: iteration 2564/ 3814 | consumed samples: 1312768 | consumed tokens: 2688548864 | elapsed time per iteration (s): 5.45 | learning rate: 2.896E-05 | global batch size: 512 | lm loss: 4.974021E+00 | loss scale: 131072.0 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.918 | TFLOPs: 70.94 | +[default7]: iteration 2565/ 3814 | consumed samples: 1313280 | consumed tokens: 2689597440 | elapsed time per iteration (s): 5.41 | learning rate: 2.892E-05 | global batch size: 512 | lm loss: 4.979435E+00 | loss scale: 131072.0 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.592 | TFLOPs: 71.45 | +[default7]: iteration 2566/ 3814 | consumed samples: 1313792 | consumed tokens: 2690646016 | elapsed time per iteration (s): 5.44 | learning rate: 2.888E-05 | global batch size: 512 | lm loss: 5.020658E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.167 | TFLOPs: 71.13 | +[default7]: iteration 2567/ 3814 | consumed samples: 1314304 | consumed tokens: 2691694592 | elapsed time per iteration (s): 5.42 | learning rate: 2.884E-05 | global batch size: 512 | lm loss: 4.992925E+00 | loss scale: 131072.0 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.391 | TFLOPs: 71.29 | +[default7]: iteration 2568/ 3814 | consumed samples: 1314816 | consumed tokens: 2692743168 | elapsed time per iteration (s): 5.48 | learning rate: 2.880E-05 | global batch size: 512 | lm loss: 5.010694E+00 | loss scale: 131072.0 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.498 | TFLOPs: 70.62 | +[default7]: iteration 2569/ 3814 | consumed samples: 1315328 | consumed tokens: 2693791744 | elapsed time per iteration (s): 5.48 | learning rate: 2.876E-05 | global batch size: 512 | lm loss: 4.986499E+00 | loss scale: 131072.0 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.354 | TFLOPs: 70.51 | +[default7]: iteration 2570/ 3814 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 5.48 | learning rate: 2.872E-05 | global batch size: 512 | lm loss: 4.990359E+00 | loss scale: 131072.0 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.412 | TFLOPs: 70.55 | +[default7]: iteration 2571/ 3814 | consumed samples: 1316352 | consumed tokens: 2695888896 | elapsed time per iteration (s): 5.46 | learning rate: 2.868E-05 | global batch size: 512 | lm loss: 4.998248E+00 | loss scale: 131072.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.771 | TFLOPs: 70.83 | +[default7]: iteration 2572/ 3814 | consumed samples: 1316864 | consumed tokens: 2696937472 | elapsed time per iteration (s): 5.43 | learning rate: 2.864E-05 | global batch size: 512 | lm loss: 5.000885E+00 | loss scale: 131072.0 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.260 | TFLOPs: 71.20 | +[default7]: iteration 2573/ 3814 | consumed samples: 1317376 | consumed tokens: 2697986048 | elapsed time per iteration (s): 5.44 | learning rate: 2.859E-05 | global batch size: 512 | lm loss: 4.975523E+00 | loss scale: 131072.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.171 | TFLOPs: 71.13 | +[default7]: iteration 2574/ 3814 | consumed samples: 1317888 | consumed tokens: 2699034624 | elapsed time per iteration (s): 5.45 | learning rate: 2.855E-05 | global batch size: 512 | lm loss: 5.011574E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.005 | TFLOPs: 71.00 | +[default7]: iteration 2575/ 3814 | consumed samples: 1318400 | consumed tokens: 2700083200 | elapsed time per iteration (s): 5.45 | learning rate: 2.851E-05 | global batch size: 512 | lm loss: 4.969366E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.980 | TFLOPs: 70.98 | +[default7]: iteration 2576/ 3814 | consumed samples: 1318912 | consumed tokens: 2701131776 | elapsed time per iteration (s): 5.43 | learning rate: 2.847E-05 | global batch size: 512 | lm loss: 4.990157E+00 | loss scale: 131072.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.208 | TFLOPs: 71.16 | +[default7]: iteration 2577/ 3814 | consumed samples: 1319424 | consumed tokens: 2702180352 | elapsed time per iteration (s): 5.44 | learning rate: 2.843E-05 | global batch size: 512 | lm loss: 5.007734E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.100 | TFLOPs: 71.07 | +[default7]: iteration 2578/ 3814 | consumed samples: 1319936 | consumed tokens: 2703228928 | elapsed time per iteration (s): 5.44 | learning rate: 2.839E-05 | global batch size: 512 | lm loss: 5.002987E+00 | loss scale: 131072.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.144 | TFLOPs: 71.11 | +[default7]: iteration 2579/ 3814 | consumed samples: 1320448 | consumed tokens: 2704277504 | elapsed time per iteration (s): 5.43 | learning rate: 2.835E-05 | global batch size: 512 | lm loss: 5.016034E+00 | loss scale: 131072.0 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.362 | TFLOPs: 71.27 | +[default7]: iteration 2580/ 3814 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 5.42 | learning rate: 2.831E-05 | global batch size: 512 | lm loss: 4.985241E+00 | loss scale: 131072.0 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.379 | TFLOPs: 71.29 | +[default7]: iteration 2581/ 3814 | consumed samples: 1321472 | consumed tokens: 2706374656 | elapsed time per iteration (s): 5.46 | learning rate: 2.827E-05 | global batch size: 512 | lm loss: 4.976392E+00 | loss scale: 131072.0 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.835 | TFLOPs: 70.87 | +[default7]: iteration 2582/ 3814 | consumed samples: 1321984 | consumed tokens: 2707423232 | elapsed time per iteration (s): 5.44 | learning rate: 2.823E-05 | global batch size: 512 | lm loss: 4.990312E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.193 | TFLOPs: 71.14 | +[default7]: iteration 2583/ 3814 | consumed samples: 1322496 | consumed tokens: 2708471808 | elapsed time per iteration (s): 5.43 | learning rate: 2.818E-05 | global batch size: 512 | lm loss: 4.969501E+00 | loss scale: 131072.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.259 | TFLOPs: 71.19 | +[default7]: iteration 2584/ 3814 | consumed samples: 1323008 | consumed tokens: 2709520384 | elapsed time per iteration (s): 5.43 | learning rate: 2.814E-05 | global batch size: 512 | lm loss: 4.968843E+00 | loss scale: 131072.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.346 | TFLOPs: 71.26 | +[default7]: iteration 2585/ 3814 | consumed samples: 1323520 | consumed tokens: 2710568960 | elapsed time per iteration (s): 5.45 | learning rate: 2.810E-05 | global batch size: 512 | lm loss: 4.984022E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.922 | TFLOPs: 70.94 | +[default7]: iteration 2586/ 3814 | consumed samples: 1324032 | consumed tokens: 2711617536 | elapsed time per iteration (s): 5.45 | learning rate: 2.806E-05 | global batch size: 512 | lm loss: 4.964565E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.030 | TFLOPs: 71.02 | +[default7]: iteration 2587/ 3814 | consumed samples: 1324544 | consumed tokens: 2712666112 | elapsed time per iteration (s): 5.43 | learning rate: 2.802E-05 | global batch size: 512 | lm loss: 4.972423E+00 | loss scale: 131072.0 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.259 | TFLOPs: 71.19 | +[default7]: iteration 2588/ 3814 | consumed samples: 1325056 | consumed tokens: 2713714688 | elapsed time per iteration (s): 5.45 | learning rate: 2.798E-05 | global batch size: 512 | lm loss: 5.003235E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.002 | TFLOPs: 71.00 | +[default7]: iteration 2589/ 3814 | consumed samples: 1325568 | consumed tokens: 2714763264 | elapsed time per iteration (s): 5.46 | learning rate: 2.794E-05 | global batch size: 512 | lm loss: 4.977264E+00 | loss scale: 131072.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.852 | TFLOPs: 70.89 | +[default7]: iteration 2590/ 3814 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 5.47 | learning rate: 2.790E-05 | global batch size: 512 | lm loss: 5.002948E+00 | loss scale: 131072.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.642 | TFLOPs: 70.73 | +[default7]: iteration 2591/ 3814 | consumed samples: 1326592 | consumed tokens: 2716860416 | elapsed time per iteration (s): 5.46 | learning rate: 2.786E-05 | global batch size: 512 | lm loss: 4.988743E+00 | loss scale: 131072.0 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.697 | TFLOPs: 70.77 | +[default7]: iteration 2592/ 3814 | consumed samples: 1327104 | consumed tokens: 2717908992 | elapsed time per iteration (s): 5.48 | learning rate: 2.782E-05 | global batch size: 512 | lm loss: 4.969883E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.434 | TFLOPs: 70.57 | +[default7]: iteration 2593/ 3814 | consumed samples: 1327616 | consumed tokens: 2718957568 | elapsed time per iteration (s): 5.50 | learning rate: 2.778E-05 | global batch size: 512 | lm loss: 4.987430E+00 | loss scale: 131072.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.169 | TFLOPs: 70.37 | +[default7]: iteration 2594/ 3814 | consumed samples: 1328128 | consumed tokens: 2720006144 | elapsed time per iteration (s): 5.43 | learning rate: 2.774E-05 | global batch size: 512 | lm loss: 4.994323E+00 | loss scale: 131072.0 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.279 | TFLOPs: 71.21 | +[default7]: iteration 2595/ 3814 | consumed samples: 1328640 | consumed tokens: 2721054720 | elapsed time per iteration (s): 5.45 | learning rate: 2.769E-05 | global batch size: 512 | lm loss: 5.023943E+00 | loss scale: 131072.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.948 | TFLOPs: 70.96 | +[default7]: iteration 2596/ 3814 | consumed samples: 1329152 | consumed tokens: 2722103296 | elapsed time per iteration (s): 5.42 | learning rate: 2.765E-05 | global batch size: 512 | lm loss: 4.970465E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.382 | TFLOPs: 71.29 | +[default7]: iteration 2597/ 3814 | consumed samples: 1329664 | consumed tokens: 2723151872 | elapsed time per iteration (s): 5.44 | learning rate: 2.761E-05 | global batch size: 512 | lm loss: 4.983717E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.110 | TFLOPs: 71.08 | +[default7]: iteration 2598/ 3814 | consumed samples: 1330176 | consumed tokens: 2724200448 | elapsed time per iteration (s): 5.46 | learning rate: 2.757E-05 | global batch size: 512 | lm loss: 4.989784E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.848 | TFLOPs: 70.88 | +[default7]: iteration 2599/ 3814 | consumed samples: 1330688 | consumed tokens: 2725249024 | elapsed time per iteration (s): 5.44 | learning rate: 2.753E-05 | global batch size: 512 | lm loss: 5.000579E+00 | loss scale: 131072.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.045 | TFLOPs: 71.03 | +[default7]: iteration 2600/ 3814 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 5.44 | learning rate: 2.749E-05 | global batch size: 512 | lm loss: 4.969972E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.078 | TFLOPs: 71.06 | +[default7]: iteration 2601/ 3814 | consumed samples: 1331712 | consumed tokens: 2727346176 | elapsed time per iteration (s): 5.44 | learning rate: 2.745E-05 | global batch size: 512 | lm loss: 4.986989E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.132 | TFLOPs: 71.10 | +[default7]: iteration 2602/ 3814 | consumed samples: 1332224 | consumed tokens: 2728394752 | elapsed time per iteration (s): 5.42 | learning rate: 2.741E-05 | global batch size: 512 | lm loss: 4.985002E+00 | loss scale: 131072.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.537 | TFLOPs: 71.40 | +[default7]: iteration 2603/ 3814 | consumed samples: 1332736 | consumed tokens: 2729443328 | elapsed time per iteration (s): 5.46 | learning rate: 2.737E-05 | global batch size: 512 | lm loss: 4.985905E+00 | loss scale: 131072.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.720 | TFLOPs: 70.79 | +[default7]: iteration 2604/ 3814 | consumed samples: 1333248 | consumed tokens: 2730491904 | elapsed time per iteration (s): 5.44 | learning rate: 2.733E-05 | global batch size: 512 | lm loss: 4.964533E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.086 | TFLOPs: 71.06 | +[default7]: iteration 2605/ 3814 | consumed samples: 1333760 | consumed tokens: 2731540480 | elapsed time per iteration (s): 5.43 | learning rate: 2.729E-05 | global batch size: 512 | lm loss: 4.960235E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.235 | TFLOPs: 71.18 | +[default7]: iteration 2606/ 3814 | consumed samples: 1334272 | consumed tokens: 2732589056 | elapsed time per iteration (s): 5.43 | learning rate: 2.725E-05 | global batch size: 512 | lm loss: 4.969430E+00 | loss scale: 131072.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.272 | TFLOPs: 71.20 | +[default7]: iteration 2607/ 3814 | consumed samples: 1334784 | consumed tokens: 2733637632 | elapsed time per iteration (s): 5.44 | learning rate: 2.721E-05 | global batch size: 512 | lm loss: 4.993282E+00 | loss scale: 131072.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.176 | TFLOPs: 71.13 | +[default7]: iteration 2608/ 3814 | consumed samples: 1335296 | consumed tokens: 2734686208 | elapsed time per iteration (s): 5.45 | learning rate: 2.717E-05 | global batch size: 512 | lm loss: 4.945123E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.977 | TFLOPs: 70.98 | +[default7]: iteration 2609/ 3814 | consumed samples: 1335808 | consumed tokens: 2735734784 | elapsed time per iteration (s): 5.42 | learning rate: 2.713E-05 | global batch size: 512 | lm loss: 4.987716E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.387 | TFLOPs: 71.29 | +[default7]: iteration 2610/ 3814 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 5.44 | learning rate: 2.709E-05 | global batch size: 512 | lm loss: 4.988438E+00 | loss scale: 131072.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.162 | TFLOPs: 71.12 | +[default7]: iteration 2611/ 3814 | consumed samples: 1336832 | consumed tokens: 2737831936 | elapsed time per iteration (s): 5.42 | learning rate: 2.705E-05 | global batch size: 512 | lm loss: 4.980920E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.473 | TFLOPs: 71.36 | +[default7]: iteration 2612/ 3814 | consumed samples: 1337344 | consumed tokens: 2738880512 | elapsed time per iteration (s): 5.43 | learning rate: 2.701E-05 | global batch size: 512 | lm loss: 4.985119E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.309 | TFLOPs: 71.23 | +[default7]: iteration 2613/ 3814 | consumed samples: 1337856 | consumed tokens: 2739929088 | elapsed time per iteration (s): 5.44 | learning rate: 2.697E-05 | global batch size: 512 | lm loss: 4.968342E+00 | loss scale: 131072.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.183 | TFLOPs: 71.14 | +[default7]: iteration 2614/ 3814 | consumed samples: 1338368 | consumed tokens: 2740977664 | elapsed time per iteration (s): 5.42 | learning rate: 2.693E-05 | global batch size: 512 | lm loss: 4.958812E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.451 | TFLOPs: 71.34 | +[default7]: iteration 2615/ 3814 | consumed samples: 1338880 | consumed tokens: 2742026240 | elapsed time per iteration (s): 5.42 | learning rate: 2.689E-05 | global batch size: 512 | lm loss: 4.969024E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.524 | TFLOPs: 71.40 | +[default7]: iteration 2616/ 3814 | consumed samples: 1339392 | consumed tokens: 2743074816 | elapsed time per iteration (s): 5.42 | learning rate: 2.685E-05 | global batch size: 512 | lm loss: 4.973274E+00 | loss scale: 131072.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.432 | TFLOPs: 71.33 | +[default7]: iteration 2617/ 3814 | consumed samples: 1339904 | consumed tokens: 2744123392 | elapsed time per iteration (s): 5.40 | learning rate: 2.680E-05 | global batch size: 512 | lm loss: 4.988283E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.829 | TFLOPs: 71.63 | +[default7]: iteration 2618/ 3814 | consumed samples: 1340416 | consumed tokens: 2745171968 | elapsed time per iteration (s): 5.46 | learning rate: 2.676E-05 | global batch size: 512 | lm loss: 4.978752E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.726 | TFLOPs: 70.79 | +[default7]: iteration 2619/ 3814 | consumed samples: 1340928 | consumed tokens: 2746220544 | elapsed time per iteration (s): 5.40 | learning rate: 2.672E-05 | global batch size: 512 | lm loss: 4.981936E+00 | loss scale: 131072.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.736 | TFLOPs: 71.55 | +[default7]: iteration 2620/ 3814 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 5.45 | learning rate: 2.668E-05 | global batch size: 512 | lm loss: 4.960586E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.964 | TFLOPs: 70.97 | +[default7]: iteration 2621/ 3814 | consumed samples: 1341952 | consumed tokens: 2748317696 | elapsed time per iteration (s): 5.44 | learning rate: 2.664E-05 | global batch size: 512 | lm loss: 4.990486E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.149 | TFLOPs: 71.11 | +[default7]: iteration 2622/ 3814 | consumed samples: 1342464 | consumed tokens: 2749366272 | elapsed time per iteration (s): 5.43 | learning rate: 2.660E-05 | global batch size: 512 | lm loss: 4.951721E+00 | loss scale: 131072.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.218 | TFLOPs: 71.16 | +[default7]: iteration 2623/ 3814 | consumed samples: 1342976 | consumed tokens: 2750414848 | elapsed time per iteration (s): 5.43 | learning rate: 2.656E-05 | global batch size: 512 | lm loss: 4.978883E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.365 | TFLOPs: 71.27 | +[default7]: iteration 2624/ 3814 | consumed samples: 1343488 | consumed tokens: 2751463424 | elapsed time per iteration (s): 5.42 | learning rate: 2.652E-05 | global batch size: 512 | lm loss: 4.975100E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.406 | TFLOPs: 71.31 | +[default7]: iteration 2625/ 3814 | consumed samples: 1344000 | consumed tokens: 2752512000 | elapsed time per iteration (s): 5.42 | learning rate: 2.648E-05 | global batch size: 512 | lm loss: 4.973384E+00 | loss scale: 131072.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.543 | TFLOPs: 71.41 | +[default7]: iteration 2626/ 3814 | consumed samples: 1344512 | consumed tokens: 2753560576 | elapsed time per iteration (s): 5.43 | learning rate: 2.644E-05 | global batch size: 512 | lm loss: 4.977041E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.364 | TFLOPs: 71.27 | +[default7]: iteration 2627/ 3814 | consumed samples: 1345024 | consumed tokens: 2754609152 | elapsed time per iteration (s): 5.42 | learning rate: 2.640E-05 | global batch size: 512 | lm loss: 4.965394E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.470 | TFLOPs: 71.35 | +[default7]: iteration 2628/ 3814 | consumed samples: 1345536 | consumed tokens: 2755657728 | elapsed time per iteration (s): 5.42 | learning rate: 2.636E-05 | global batch size: 512 | lm loss: 4.994899E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.420 | TFLOPs: 71.32 | +[default7]: iteration 2629/ 3814 | consumed samples: 1346048 | consumed tokens: 2756706304 | elapsed time per iteration (s): 5.46 | learning rate: 2.632E-05 | global batch size: 512 | lm loss: 4.951338E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.842 | TFLOPs: 70.88 | +[default7]: iteration 2630/ 3814 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 5.42 | learning rate: 2.628E-05 | global batch size: 512 | lm loss: 4.975976E+00 | loss scale: 131072.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.430 | TFLOPs: 71.32 | +[default7]: iteration 2631/ 3814 | consumed samples: 1347072 | consumed tokens: 2758803456 | elapsed time per iteration (s): 5.45 | learning rate: 2.624E-05 | global batch size: 512 | lm loss: 4.982203E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.010 | TFLOPs: 71.01 | +[default7]: iteration 2632/ 3814 | consumed samples: 1347584 | consumed tokens: 2759852032 | elapsed time per iteration (s): 5.42 | learning rate: 2.620E-05 | global batch size: 512 | lm loss: 5.002073E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.527 | TFLOPs: 71.40 | +[default7]: iteration 2633/ 3814 | consumed samples: 1348096 | consumed tokens: 2760900608 | elapsed time per iteration (s): 5.41 | learning rate: 2.616E-05 | global batch size: 512 | lm loss: 4.973094E+00 | loss scale: 131072.0 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.585 | TFLOPs: 71.44 | +[default7]: iteration 2634/ 3814 | consumed samples: 1348608 | consumed tokens: 2761949184 | elapsed time per iteration (s): 5.38 | learning rate: 2.612E-05 | global batch size: 512 | lm loss: 4.960553E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 95.162 | TFLOPs: 71.88 | +[default7]: iteration 2635/ 3814 | consumed samples: 1349120 | consumed tokens: 2762997760 | elapsed time per iteration (s): 5.44 | learning rate: 2.608E-05 | global batch size: 512 | lm loss: 4.979639E+00 | loss scale: 131072.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.036 | TFLOPs: 71.03 | +[default7]: iteration 2636/ 3814 | consumed samples: 1349632 | consumed tokens: 2764046336 | elapsed time per iteration (s): 5.41 | learning rate: 2.604E-05 | global batch size: 512 | lm loss: 4.951200E+00 | loss scale: 131072.0 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.719 | TFLOPs: 71.54 | +[default7]: iteration 2637/ 3814 | consumed samples: 1350144 | consumed tokens: 2765094912 | elapsed time per iteration (s): 5.41 | learning rate: 2.600E-05 | global batch size: 512 | lm loss: 4.978715E+00 | loss scale: 131072.0 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.595 | TFLOPs: 71.45 | +[default7]: iteration 2638/ 3814 | consumed samples: 1350656 | consumed tokens: 2766143488 | elapsed time per iteration (s): 5.41 | learning rate: 2.596E-05 | global batch size: 512 | lm loss: 4.976398E+00 | loss scale: 131072.0 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.633 | TFLOPs: 71.48 | +[default7]: iteration 2639/ 3814 | consumed samples: 1351168 | consumed tokens: 2767192064 | elapsed time per iteration (s): 5.41 | learning rate: 2.592E-05 | global batch size: 512 | lm loss: 4.967465E+00 | loss scale: 131072.0 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.668 | TFLOPs: 71.50 | +[default7]: iteration 2640/ 3814 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 5.42 | learning rate: 2.588E-05 | global batch size: 512 | lm loss: 4.963664E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.429 | TFLOPs: 71.32 | +[default7]: iteration 2641/ 3814 | consumed samples: 1352192 | consumed tokens: 2769289216 | elapsed time per iteration (s): 5.41 | learning rate: 2.584E-05 | global batch size: 512 | lm loss: 4.952130E+00 | loss scale: 131072.0 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.558 | TFLOPs: 71.42 | +[default7]: iteration 2642/ 3814 | consumed samples: 1352704 | consumed tokens: 2770337792 | elapsed time per iteration (s): 5.42 | learning rate: 2.580E-05 | global batch size: 512 | lm loss: 4.979561E+00 | loss scale: 131072.0 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.401 | TFLOPs: 71.30 | +[default7]: iteration 2643/ 3814 | consumed samples: 1353216 | consumed tokens: 2771386368 | elapsed time per iteration (s): 5.44 | learning rate: 2.576E-05 | global batch size: 512 | lm loss: 4.955739E+00 | loss scale: 131072.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.147 | TFLOPs: 71.11 | +[default7]: iteration 2644/ 3814 | consumed samples: 1353728 | consumed tokens: 2772434944 | elapsed time per iteration (s): 5.44 | learning rate: 2.573E-05 | global batch size: 512 | lm loss: 4.967755E+00 | loss scale: 131072.0 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.166 | TFLOPs: 71.12 | +[default7]: iteration 2645/ 3814 | consumed samples: 1354240 | consumed tokens: 2773483520 | elapsed time per iteration (s): 5.45 | learning rate: 2.569E-05 | global batch size: 512 | lm loss: 4.963408E+00 | loss scale: 131072.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.027 | TFLOPs: 71.02 | +[default7]: iteration 2646/ 3814 | consumed samples: 1354752 | consumed tokens: 2774532096 | elapsed time per iteration (s): 5.44 | learning rate: 2.565E-05 | global batch size: 512 | lm loss: 4.948385E+00 | loss scale: 131072.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.127 | TFLOPs: 71.10 | +[default7]: iteration 2647/ 3814 | consumed samples: 1355264 | consumed tokens: 2775580672 | elapsed time per iteration (s): 5.44 | learning rate: 2.561E-05 | global batch size: 512 | lm loss: 4.971186E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.069 | TFLOPs: 71.05 | +[default7]: iteration 2648/ 3814 | consumed samples: 1355776 | consumed tokens: 2776629248 | elapsed time per iteration (s): 5.44 | learning rate: 2.557E-05 | global batch size: 512 | lm loss: 4.950398E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.097 | TFLOPs: 71.07 | +[default7]: iteration 2649/ 3814 | consumed samples: 1356288 | consumed tokens: 2777677824 | elapsed time per iteration (s): 5.42 | learning rate: 2.553E-05 | global batch size: 512 | lm loss: 4.963561E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.414 | TFLOPs: 71.31 | +[default7]: iteration 2650/ 3814 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 5.42 | learning rate: 2.549E-05 | global batch size: 512 | lm loss: 4.944726E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.541 | TFLOPs: 71.41 | +[default7]: iteration 2651/ 3814 | consumed samples: 1357312 | consumed tokens: 2779774976 | elapsed time per iteration (s): 5.43 | learning rate: 2.545E-05 | global batch size: 512 | lm loss: 4.931002E+00 | loss scale: 131072.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.358 | TFLOPs: 71.27 | +[default7]: iteration 2652/ 3814 | consumed samples: 1357824 | consumed tokens: 2780823552 | elapsed time per iteration (s): 5.43 | learning rate: 2.541E-05 | global batch size: 512 | lm loss: 4.974319E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.350 | TFLOPs: 71.26 | +[default7]: iteration 2653/ 3814 | consumed samples: 1358336 | consumed tokens: 2781872128 | elapsed time per iteration (s): 5.44 | learning rate: 2.537E-05 | global batch size: 512 | lm loss: 4.978077E+00 | loss scale: 131072.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.068 | TFLOPs: 71.05 | +[default7]: iteration 2654/ 3814 | consumed samples: 1358848 | consumed tokens: 2782920704 | elapsed time per iteration (s): 5.47 | learning rate: 2.533E-05 | global batch size: 512 | lm loss: 4.971214E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.651 | TFLOPs: 70.74 | +[default7]: iteration 2655/ 3814 | consumed samples: 1359360 | consumed tokens: 2783969280 | elapsed time per iteration (s): 5.46 | learning rate: 2.529E-05 | global batch size: 512 | lm loss: 4.964517E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.828 | TFLOPs: 70.87 | +[default7]: iteration 2656/ 3814 | consumed samples: 1359872 | consumed tokens: 2785017856 | elapsed time per iteration (s): 5.42 | learning rate: 2.525E-05 | global batch size: 512 | lm loss: 4.971586E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.398 | TFLOPs: 71.30 | +[default7]: iteration 2657/ 3814 | consumed samples: 1360384 | consumed tokens: 2786066432 | elapsed time per iteration (s): 5.46 | learning rate: 2.521E-05 | global batch size: 512 | lm loss: 4.967853E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.746 | TFLOPs: 70.81 | +[default7]: iteration 2658/ 3814 | consumed samples: 1360896 | consumed tokens: 2787115008 | elapsed time per iteration (s): 5.43 | learning rate: 2.517E-05 | global batch size: 512 | lm loss: 4.981406E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.272 | TFLOPs: 71.20 | +[default7]: iteration 2659/ 3814 | consumed samples: 1361408 | consumed tokens: 2788163584 | elapsed time per iteration (s): 5.42 | learning rate: 2.513E-05 | global batch size: 512 | lm loss: 4.978923E+00 | loss scale: 131072.0 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.438 | TFLOPs: 71.33 | +[default7]: iteration 2660/ 3814 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 5.46 | learning rate: 2.509E-05 | global batch size: 512 | lm loss: 4.976071E+00 | loss scale: 131072.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.795 | TFLOPs: 70.84 | +[default7]: iteration 2661/ 3814 | consumed samples: 1362432 | consumed tokens: 2790260736 | elapsed time per iteration (s): 5.44 | learning rate: 2.505E-05 | global batch size: 512 | lm loss: 4.977333E+00 | loss scale: 131072.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.134 | TFLOPs: 71.10 | +[default7]: iteration 2662/ 3814 | consumed samples: 1362944 | consumed tokens: 2791309312 | elapsed time per iteration (s): 5.45 | learning rate: 2.501E-05 | global batch size: 512 | lm loss: 4.973563E+00 | loss scale: 131072.0 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.020 | TFLOPs: 71.01 | +[default7]: iteration 2663/ 3814 | consumed samples: 1363456 | consumed tokens: 2792357888 | elapsed time per iteration (s): 5.47 | learning rate: 2.497E-05 | global batch size: 512 | lm loss: 4.951107E+00 | loss scale: 131072.0 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.632 | TFLOPs: 70.72 | +[default7]: iteration 2664/ 3814 | consumed samples: 1363968 | consumed tokens: 2793406464 | elapsed time per iteration (s): 5.45 | learning rate: 2.493E-05 | global batch size: 512 | lm loss: 4.935547E+00 | loss scale: 131072.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.878 | TFLOPs: 70.91 | +[default7]: iteration 2665/ 3814 | consumed samples: 1364480 | consumed tokens: 2794455040 | elapsed time per iteration (s): 5.48 | learning rate: 2.490E-05 | global batch size: 512 | lm loss: 4.960285E+00 | loss scale: 131072.0 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.458 | TFLOPs: 70.59 | +[default7]: iteration 2666/ 3814 | consumed samples: 1364992 | consumed tokens: 2795503616 | elapsed time per iteration (s): 5.44 | learning rate: 2.486E-05 | global batch size: 512 | lm loss: 4.963805E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.127 | TFLOPs: 71.10 | +[default7]: iteration 2667/ 3814 | consumed samples: 1365504 | consumed tokens: 2796552192 | elapsed time per iteration (s): 5.45 | learning rate: 2.482E-05 | global batch size: 512 | lm loss: 4.967998E+00 | loss scale: 131072.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.986 | TFLOPs: 70.99 | +[default7]: iteration 2668/ 3814 | consumed samples: 1366016 | consumed tokens: 2797600768 | elapsed time per iteration (s): 5.47 | learning rate: 2.478E-05 | global batch size: 512 | lm loss: 4.965756E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.618 | TFLOPs: 70.71 | +[default7]: iteration 2669/ 3814 | consumed samples: 1366528 | consumed tokens: 2798649344 | elapsed time per iteration (s): 5.45 | learning rate: 2.474E-05 | global batch size: 512 | lm loss: 4.958941E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.009 | TFLOPs: 71.01 | +[default7]: iteration 2670/ 3814 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 5.47 | learning rate: 2.470E-05 | global batch size: 512 | lm loss: 4.953010E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.668 | TFLOPs: 70.75 | +[default7]: iteration 2671/ 3814 | consumed samples: 1367552 | consumed tokens: 2800746496 | elapsed time per iteration (s): 5.44 | learning rate: 2.466E-05 | global batch size: 512 | lm loss: 4.964894E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.080 | TFLOPs: 71.06 | +[default7]: iteration 2672/ 3814 | consumed samples: 1368064 | consumed tokens: 2801795072 | elapsed time per iteration (s): 5.45 | learning rate: 2.462E-05 | global batch size: 512 | lm loss: 4.949668E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.973 | TFLOPs: 70.98 | +[default7]: iteration 2673/ 3814 | consumed samples: 1368576 | consumed tokens: 2802843648 | elapsed time per iteration (s): 5.47 | learning rate: 2.458E-05 | global batch size: 512 | lm loss: 4.982453E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.607 | TFLOPs: 70.70 | +[default7]: iteration 2674/ 3814 | consumed samples: 1369088 | consumed tokens: 2803892224 | elapsed time per iteration (s): 5.46 | learning rate: 2.454E-05 | global batch size: 512 | lm loss: 4.954559E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.780 | TFLOPs: 70.83 | +[default7]: iteration 2675/ 3814 | consumed samples: 1369600 | consumed tokens: 2804940800 | elapsed time per iteration (s): 5.46 | learning rate: 2.450E-05 | global batch size: 512 | lm loss: 4.946769E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.855 | TFLOPs: 70.89 | +[default7]: iteration 2676/ 3814 | consumed samples: 1370112 | consumed tokens: 2805989376 | elapsed time per iteration (s): 5.47 | learning rate: 2.446E-05 | global batch size: 512 | lm loss: 4.963036E+00 | loss scale: 131072.0 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.572 | TFLOPs: 70.68 | +[default7]: iteration 2677/ 3814 | consumed samples: 1370624 | consumed tokens: 2807037952 | elapsed time per iteration (s): 5.48 | learning rate: 2.443E-05 | global batch size: 512 | lm loss: 4.965816E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.395 | TFLOPs: 70.54 | +[default7]: iteration 2678/ 3814 | consumed samples: 1371136 | consumed tokens: 2808086528 | elapsed time per iteration (s): 5.43 | learning rate: 2.439E-05 | global batch size: 512 | lm loss: 4.981429E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.218 | TFLOPs: 71.16 | +[default7]: iteration 2679/ 3814 | consumed samples: 1371648 | consumed tokens: 2809135104 | elapsed time per iteration (s): 5.46 | learning rate: 2.435E-05 | global batch size: 512 | lm loss: 4.965153E+00 | loss scale: 131072.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.742 | TFLOPs: 70.80 | +[default7]: iteration 2680/ 3814 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 5.46 | learning rate: 2.431E-05 | global batch size: 512 | lm loss: 4.964232E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.747 | TFLOPs: 70.81 | +[default7]: iteration 2681/ 3814 | consumed samples: 1372672 | consumed tokens: 2811232256 | elapsed time per iteration (s): 5.44 | learning rate: 2.427E-05 | global batch size: 512 | lm loss: 4.949439E+00 | loss scale: 131072.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.159 | TFLOPs: 71.12 | +[default7]: iteration 2682/ 3814 | consumed samples: 1373184 | consumed tokens: 2812280832 | elapsed time per iteration (s): 5.44 | learning rate: 2.423E-05 | global batch size: 512 | lm loss: 4.980967E+00 | loss scale: 131072.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.148 | TFLOPs: 71.11 | +[default7]: iteration 2683/ 3814 | consumed samples: 1373696 | consumed tokens: 2813329408 | elapsed time per iteration (s): 5.45 | learning rate: 2.419E-05 | global batch size: 512 | lm loss: 4.951550E+00 | loss scale: 131072.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.975 | TFLOPs: 70.98 | +[default7]: iteration 2684/ 3814 | consumed samples: 1374208 | consumed tokens: 2814377984 | elapsed time per iteration (s): 5.42 | learning rate: 2.415E-05 | global batch size: 512 | lm loss: 4.929332E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.518 | TFLOPs: 71.39 | +[default7]: iteration 2685/ 3814 | consumed samples: 1374720 | consumed tokens: 2815426560 | elapsed time per iteration (s): 5.43 | learning rate: 2.411E-05 | global batch size: 512 | lm loss: 4.944350E+00 | loss scale: 131072.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.308 | TFLOPs: 71.23 | +[default7]: iteration 2686/ 3814 | consumed samples: 1375232 | consumed tokens: 2816475136 | elapsed time per iteration (s): 5.42 | learning rate: 2.407E-05 | global batch size: 512 | lm loss: 4.963668E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.485 | TFLOPs: 71.37 | +[default7]: iteration 2687/ 3814 | consumed samples: 1375744 | consumed tokens: 2817523712 | elapsed time per iteration (s): 5.43 | learning rate: 2.404E-05 | global batch size: 512 | lm loss: 4.949393E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.310 | TFLOPs: 71.23 | +[default7]: iteration 2688/ 3814 | consumed samples: 1376256 | consumed tokens: 2818572288 | elapsed time per iteration (s): 5.43 | learning rate: 2.400E-05 | global batch size: 512 | lm loss: 4.945578E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.231 | TFLOPs: 71.17 | +[default7]: iteration 2689/ 3814 | consumed samples: 1376768 | consumed tokens: 2819620864 | elapsed time per iteration (s): 5.44 | learning rate: 2.396E-05 | global batch size: 512 | lm loss: 4.938399E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.173 | TFLOPs: 71.13 | +[default7]: iteration 2690/ 3814 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 5.44 | learning rate: 2.392E-05 | global batch size: 512 | lm loss: 4.956389E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.159 | TFLOPs: 71.12 | +[default7]: iteration 2691/ 3814 | consumed samples: 1377792 | consumed tokens: 2821718016 | elapsed time per iteration (s): 5.44 | learning rate: 2.388E-05 | global batch size: 512 | lm loss: 4.968380E+00 | loss scale: 131072.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.163 | TFLOPs: 71.12 | +[default7]: iteration 2692/ 3814 | consumed samples: 1378304 | consumed tokens: 2822766592 | elapsed time per iteration (s): 5.42 | learning rate: 2.384E-05 | global batch size: 512 | lm loss: 4.956641E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.540 | TFLOPs: 71.41 | +[default7]: iteration 2693/ 3814 | consumed samples: 1378816 | consumed tokens: 2823815168 | elapsed time per iteration (s): 5.43 | learning rate: 2.380E-05 | global batch size: 512 | lm loss: 4.966115E+00 | loss scale: 131072.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.321 | TFLOPs: 71.24 | +[default7]: iteration 2694/ 3814 | consumed samples: 1379328 | consumed tokens: 2824863744 | elapsed time per iteration (s): 5.42 | learning rate: 2.376E-05 | global batch size: 512 | lm loss: 4.980713E+00 | loss scale: 131072.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.472 | TFLOPs: 71.36 | +[default7]: iteration 2695/ 3814 | consumed samples: 1379840 | consumed tokens: 2825912320 | elapsed time per iteration (s): 5.43 | learning rate: 2.373E-05 | global batch size: 512 | lm loss: 4.958176E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.254 | TFLOPs: 71.19 | +[default7]: iteration 2696/ 3814 | consumed samples: 1380352 | consumed tokens: 2826960896 | elapsed time per iteration (s): 5.47 | learning rate: 2.369E-05 | global batch size: 512 | lm loss: 4.984624E+00 | loss scale: 131072.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.654 | TFLOPs: 70.74 | +[default7]: iteration 2697/ 3814 | consumed samples: 1380864 | consumed tokens: 2828009472 | elapsed time per iteration (s): 5.47 | learning rate: 2.365E-05 | global batch size: 512 | lm loss: 4.951585E+00 | loss scale: 131072.0 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.617 | TFLOPs: 70.71 | +[default7]: iteration 2698/ 3814 | consumed samples: 1381376 | consumed tokens: 2829058048 | elapsed time per iteration (s): 5.42 | learning rate: 2.361E-05 | global batch size: 512 | lm loss: 4.994222E+00 | loss scale: 131072.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.447 | TFLOPs: 71.34 | +[default7]: iteration 2699/ 3814 | consumed samples: 1381888 | consumed tokens: 2830106624 | elapsed time per iteration (s): 5.43 | learning rate: 2.357E-05 | global batch size: 512 | lm loss: 4.966857E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.273 | TFLOPs: 71.21 | +[default7]: iteration 2700/ 3814 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 5.45 | learning rate: 2.353E-05 | global batch size: 512 | lm loss: 4.957403E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.923 | TFLOPs: 70.94 | +[default7]: iteration 2701/ 3814 | consumed samples: 1382912 | consumed tokens: 2832203776 | elapsed time per iteration (s): 5.43 | learning rate: 2.349E-05 | global batch size: 512 | lm loss: 4.955206E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.270 | TFLOPs: 71.20 | +[default7]: iteration 2702/ 3814 | consumed samples: 1383424 | consumed tokens: 2833252352 | elapsed time per iteration (s): 5.41 | learning rate: 2.346E-05 | global batch size: 512 | lm loss: 4.952453E+00 | loss scale: 131072.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.634 | TFLOPs: 71.48 | +[default7]: iteration 2703/ 3814 | consumed samples: 1383936 | consumed tokens: 2834300928 | elapsed time per iteration (s): 5.45 | learning rate: 2.342E-05 | global batch size: 512 | lm loss: 4.980826E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.878 | TFLOPs: 70.91 | +[default7]: iteration 2704/ 3814 | consumed samples: 1384448 | consumed tokens: 2835349504 | elapsed time per iteration (s): 5.44 | learning rate: 2.338E-05 | global batch size: 512 | lm loss: 4.950119E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.172 | TFLOPs: 71.13 | +[default7]: iteration 2705/ 3814 | consumed samples: 1384960 | consumed tokens: 2836398080 | elapsed time per iteration (s): 5.42 | learning rate: 2.334E-05 | global batch size: 512 | lm loss: 4.948427E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.537 | TFLOPs: 71.40 | +[default7]: iteration 2706/ 3814 | consumed samples: 1385472 | consumed tokens: 2837446656 | elapsed time per iteration (s): 5.44 | learning rate: 2.330E-05 | global batch size: 512 | lm loss: 4.977234E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.106 | TFLOPs: 71.08 | +[default7]: iteration 2707/ 3814 | consumed samples: 1385984 | consumed tokens: 2838495232 | elapsed time per iteration (s): 5.45 | learning rate: 2.326E-05 | global batch size: 512 | lm loss: 4.934554E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.898 | TFLOPs: 70.92 | +[default7]: iteration 2708/ 3814 | consumed samples: 1386496 | consumed tokens: 2839543808 | elapsed time per iteration (s): 5.43 | learning rate: 2.323E-05 | global batch size: 512 | lm loss: 4.945499E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.305 | TFLOPs: 71.23 | +[default7]: iteration 2709/ 3814 | consumed samples: 1387008 | consumed tokens: 2840592384 | elapsed time per iteration (s): 5.42 | learning rate: 2.319E-05 | global batch size: 512 | lm loss: 4.978083E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.520 | TFLOPs: 71.39 | +[default7]: iteration 2710/ 3814 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 5.41 | learning rate: 2.315E-05 | global batch size: 512 | lm loss: 4.948852E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.666 | TFLOPs: 71.50 | +[default7]: iteration 2711/ 3814 | consumed samples: 1388032 | consumed tokens: 2842689536 | elapsed time per iteration (s): 5.44 | learning rate: 2.311E-05 | global batch size: 512 | lm loss: 4.953626E+00 | loss scale: 131072.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.128 | TFLOPs: 71.10 | +[default7]: iteration 2712/ 3814 | consumed samples: 1388544 | consumed tokens: 2843738112 | elapsed time per iteration (s): 5.42 | learning rate: 2.307E-05 | global batch size: 512 | lm loss: 4.969302E+00 | loss scale: 131072.0 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.422 | TFLOPs: 71.32 | +[default7]: iteration 2713/ 3814 | consumed samples: 1389056 | consumed tokens: 2844786688 | elapsed time per iteration (s): 5.41 | learning rate: 2.303E-05 | global batch size: 512 | lm loss: 4.969666E+00 | loss scale: 131072.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.614 | TFLOPs: 71.46 | +[default7]: iteration 2714/ 3814 | consumed samples: 1389568 | consumed tokens: 2845835264 | elapsed time per iteration (s): 5.41 | learning rate: 2.300E-05 | global batch size: 512 | lm loss: 4.996595E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.619 | TFLOPs: 71.47 | +[default7]: iteration 2715/ 3814 | consumed samples: 1390080 | consumed tokens: 2846883840 | elapsed time per iteration (s): 5.44 | learning rate: 2.296E-05 | global batch size: 512 | lm loss: 4.962106E+00 | loss scale: 131072.0 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.067 | TFLOPs: 71.05 | +[default7]: iteration 2716/ 3814 | consumed samples: 1390592 | consumed tokens: 2847932416 | elapsed time per iteration (s): 5.45 | learning rate: 2.292E-05 | global batch size: 512 | lm loss: 4.960721E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.862 | TFLOPs: 70.89 | +[default7]: iteration 2717/ 3814 | consumed samples: 1391104 | consumed tokens: 2848980992 | elapsed time per iteration (s): 5.45 | learning rate: 2.288E-05 | global batch size: 512 | lm loss: 4.913967E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.913 | TFLOPs: 70.93 | +[default7]: iteration 2718/ 3814 | consumed samples: 1391616 | consumed tokens: 2850029568 | elapsed time per iteration (s): 5.42 | learning rate: 2.284E-05 | global batch size: 512 | lm loss: 4.951831E+00 | loss scale: 131072.0 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.383 | TFLOPs: 71.29 | +[default7]: iteration 2719/ 3814 | consumed samples: 1392128 | consumed tokens: 2851078144 | elapsed time per iteration (s): 5.45 | learning rate: 2.280E-05 | global batch size: 512 | lm loss: 4.954688E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.959 | TFLOPs: 70.97 | +[default7]: iteration 2720/ 3814 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 5.47 | learning rate: 2.277E-05 | global batch size: 512 | lm loss: 4.965108E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.608 | TFLOPs: 70.70 | +[default7]: iteration 2721/ 3814 | consumed samples: 1393152 | consumed tokens: 2853175296 | elapsed time per iteration (s): 5.46 | learning rate: 2.273E-05 | global batch size: 512 | lm loss: 4.959946E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.773 | TFLOPs: 70.83 | +[default7]: iteration 2722/ 3814 | consumed samples: 1393664 | consumed tokens: 2854223872 | elapsed time per iteration (s): 5.47 | learning rate: 2.269E-05 | global batch size: 512 | lm loss: 4.951540E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.585 | TFLOPs: 70.69 | +[default7]: iteration 2723/ 3814 | consumed samples: 1394176 | consumed tokens: 2855272448 | elapsed time per iteration (s): 5.49 | learning rate: 2.265E-05 | global batch size: 512 | lm loss: 4.966934E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.328 | TFLOPs: 70.49 | +[default7]: iteration 2724/ 3814 | consumed samples: 1394688 | consumed tokens: 2856321024 | elapsed time per iteration (s): 5.49 | learning rate: 2.261E-05 | global batch size: 512 | lm loss: 4.932396E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.198 | TFLOPs: 70.39 | +[default7]: iteration 2725/ 3814 | consumed samples: 1395200 | consumed tokens: 2857369600 | elapsed time per iteration (s): 5.48 | learning rate: 2.258E-05 | global batch size: 512 | lm loss: 4.931710E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.413 | TFLOPs: 70.56 | +[default7]: iteration 2726/ 3814 | consumed samples: 1395712 | consumed tokens: 2858418176 | elapsed time per iteration (s): 5.45 | learning rate: 2.254E-05 | global batch size: 512 | lm loss: 4.961280E+00 | loss scale: 131072.0 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.960 | TFLOPs: 70.97 | +[default7]: iteration 2727/ 3814 | consumed samples: 1396224 | consumed tokens: 2859466752 | elapsed time per iteration (s): 5.44 | learning rate: 2.250E-05 | global batch size: 512 | lm loss: 4.961224E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.173 | TFLOPs: 71.13 | +[default7]: iteration 2728/ 3814 | consumed samples: 1396736 | consumed tokens: 2860515328 | elapsed time per iteration (s): 5.45 | learning rate: 2.246E-05 | global batch size: 512 | lm loss: 4.945313E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.925 | TFLOPs: 70.94 | +[default7]: iteration 2729/ 3814 | consumed samples: 1397248 | consumed tokens: 2861563904 | elapsed time per iteration (s): 5.43 | learning rate: 2.242E-05 | global batch size: 512 | lm loss: 4.956815E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.263 | TFLOPs: 71.20 | +[default7]: iteration 2730/ 3814 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 5.46 | learning rate: 2.239E-05 | global batch size: 512 | lm loss: 4.957926E+00 | loss scale: 131072.0 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.818 | TFLOPs: 70.86 | +[default7]: iteration 2731/ 3814 | consumed samples: 1398272 | consumed tokens: 2863661056 | elapsed time per iteration (s): 5.45 | learning rate: 2.235E-05 | global batch size: 512 | lm loss: 4.961824E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.890 | TFLOPs: 70.92 | +[default7]: iteration 2732/ 3814 | consumed samples: 1398784 | consumed tokens: 2864709632 | elapsed time per iteration (s): 5.45 | learning rate: 2.231E-05 | global batch size: 512 | lm loss: 4.940875E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.948 | TFLOPs: 70.96 | +[default7]: iteration 2733/ 3814 | consumed samples: 1399296 | consumed tokens: 2865758208 | elapsed time per iteration (s): 5.45 | learning rate: 2.227E-05 | global batch size: 512 | lm loss: 4.944462E+00 | loss scale: 131072.0 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.951 | TFLOPs: 70.96 | +[default7]: iteration 2734/ 3814 | consumed samples: 1399808 | consumed tokens: 2866806784 | elapsed time per iteration (s): 5.44 | learning rate: 2.224E-05 | global batch size: 512 | lm loss: 4.939998E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.039 | TFLOPs: 71.03 | +[default7]: iteration 2735/ 3814 | consumed samples: 1400320 | consumed tokens: 2867855360 | elapsed time per iteration (s): 5.44 | learning rate: 2.220E-05 | global batch size: 512 | lm loss: 4.932487E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.174 | TFLOPs: 71.13 | +[default7]: iteration 2736/ 3814 | consumed samples: 1400832 | consumed tokens: 2868903936 | elapsed time per iteration (s): 5.45 | learning rate: 2.216E-05 | global batch size: 512 | lm loss: 4.963720E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.016 | TFLOPs: 71.01 | +[default7]: iteration 2737/ 3814 | consumed samples: 1401344 | consumed tokens: 2869952512 | elapsed time per iteration (s): 5.44 | learning rate: 2.212E-05 | global batch size: 512 | lm loss: 4.938441E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.136 | TFLOPs: 71.10 | +[default7]: iteration 2738/ 3814 | consumed samples: 1401856 | consumed tokens: 2871001088 | elapsed time per iteration (s): 5.44 | learning rate: 2.208E-05 | global batch size: 512 | lm loss: 4.933309E+00 | loss scale: 131072.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.033 | TFLOPs: 71.02 | +[default7]: iteration 2739/ 3814 | consumed samples: 1402368 | consumed tokens: 2872049664 | elapsed time per iteration (s): 5.43 | learning rate: 2.205E-05 | global batch size: 512 | lm loss: 4.972419E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.372 | TFLOPs: 71.28 | +[default7]: iteration 2740/ 3814 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 5.43 | learning rate: 2.201E-05 | global batch size: 512 | lm loss: 4.960895E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.247 | TFLOPs: 71.19 | +[default7]: iteration 2741/ 3814 | consumed samples: 1403392 | consumed tokens: 2874146816 | elapsed time per iteration (s): 5.41 | learning rate: 2.197E-05 | global batch size: 512 | lm loss: 4.943002E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.619 | TFLOPs: 71.47 | +[default7]: iteration 2742/ 3814 | consumed samples: 1403904 | consumed tokens: 2875195392 | elapsed time per iteration (s): 5.39 | learning rate: 2.193E-05 | global batch size: 512 | lm loss: 4.934681E+00 | loss scale: 131072.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 95.052 | TFLOPs: 71.79 | +[default7]: iteration 2743/ 3814 | consumed samples: 1404416 | consumed tokens: 2876243968 | elapsed time per iteration (s): 5.45 | learning rate: 2.190E-05 | global batch size: 512 | lm loss: 4.903746E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.008 | TFLOPs: 71.01 | +[default7]: iteration 2744/ 3814 | consumed samples: 1404928 | consumed tokens: 2877292544 | elapsed time per iteration (s): 5.42 | learning rate: 2.186E-05 | global batch size: 512 | lm loss: 4.919789E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.522 | TFLOPs: 71.39 | +[default7]: iteration 2745/ 3814 | consumed samples: 1405440 | consumed tokens: 2878341120 | elapsed time per iteration (s): 5.43 | learning rate: 2.182E-05 | global batch size: 512 | lm loss: 4.953541E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.370 | TFLOPs: 71.28 | +[default7]: iteration 2746/ 3814 | consumed samples: 1405952 | consumed tokens: 2879389696 | elapsed time per iteration (s): 5.39 | learning rate: 2.178E-05 | global batch size: 512 | lm loss: 4.959776E+00 | loss scale: 131072.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.903 | TFLOPs: 71.68 | +[default7]: iteration 2747/ 3814 | consumed samples: 1406464 | consumed tokens: 2880438272 | elapsed time per iteration (s): 5.48 | learning rate: 2.175E-05 | global batch size: 512 | lm loss: 4.965426E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.476 | TFLOPs: 70.60 | +[default7]: iteration 2748/ 3814 | consumed samples: 1406976 | consumed tokens: 2881486848 | elapsed time per iteration (s): 5.45 | learning rate: 2.171E-05 | global batch size: 512 | lm loss: 4.966245E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.973 | TFLOPs: 70.98 | +[default7]: iteration 2749/ 3814 | consumed samples: 1407488 | consumed tokens: 2882535424 | elapsed time per iteration (s): 5.43 | learning rate: 2.167E-05 | global batch size: 512 | lm loss: 4.946638E+00 | loss scale: 131072.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.239 | TFLOPs: 71.18 | +[default1]:[2023-02-16 16:42:29,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_01-model_01-model_states.pt... +[default0]:saving checkpoint at iteration 2750 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-16 16:42:29,454] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2750 is begin to save! +[default0]:[2023-02-16 16:42:29,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_01-model_00-model_states.pt... +[default7]: iteration 2750/ 3814 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 5.44 | learning rate: 2.163E-05 | global batch size: 512 | lm loss: 4.931714E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.074 | TFLOPs: 71.06 | +[default1]:[2023-02-16 16:42:29,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_01-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:29,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_04-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:29,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_01-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:29,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_04-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:29,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_04-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:29,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_05-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:29,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_05-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:29,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_06-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:29,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_04-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:29,710] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_05-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:29,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_05-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:29,768] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_06-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:29,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_06-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:29,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_07-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:29,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_07-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:29,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_08-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:29,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_06-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:29,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_07-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:29,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_07-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:29,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_08-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:29,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_08-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:29,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_09-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:29,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_08-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:29,945] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_09-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:29,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_09-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:29,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_10-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_10-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_11-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_09-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,006] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_10-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_10-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,064] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_11-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_11-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,112] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_12-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_12-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_13-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_11-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,121] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_12-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_12-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_13-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_13-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_14-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_13-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_14-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_14-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_15-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_15-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_16-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_14-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,295] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_15-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_15-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,351] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_16-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_16-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,401] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_17-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_17-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,459] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_18-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_16-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,408] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_17-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_17-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_18-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_18-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_19-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_19-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_20-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_18-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,523] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_19-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_19-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_20-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_20-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,626] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_21-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_20-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_21-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_21-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_22-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_22-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,742] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_23-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_21-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_22-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_22-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_23-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_23-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_24-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_24-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_25-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_23-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_24-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_24-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_25-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:30,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_25-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,919] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_26-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:30,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_26-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:30,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_27-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:30,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_25-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_26-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:30,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_26-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:30,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_27-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:31,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_27-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_28-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:31,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_27-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,039] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_28-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:31,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_28-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_29-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:31,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_29-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,144] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_30-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:31,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_28-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_29-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_29-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_30-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:31,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_30-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_31-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:31,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_31-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,268] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_32-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:31,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_30-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_31-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:31,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_31-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,284] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_32-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:31,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_32-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_33-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:31,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_32-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_33-model_00-model_states.pt... +[default1]:[2023-02-16 16:42:31,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_33-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,390] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_34-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:31,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_34-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,450] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_35-model_01-model_states.pt... +[default0]:[2023-02-16 16:42:31,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_33-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,412] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_34-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_34-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_35-model_00-model_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_11_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_22_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_6_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_30_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_4_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_10_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_16_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_27_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_23_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:42:31,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_35-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_37-model_01-model_states.pt... +[default1]:[2023-02-16 16:42:31,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_37-model_01-model_states.pt. +[default1]:[2023-02-16 16:42:31,512] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/mp_rank_01_model_states.pt +[default1]:[2023-02-16 16:42:31,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/mp_rank_01_model_states.pt... +[default1]:[2023-02-16 16:42:31,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/mp_rank_01_model_states.pt. +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_0_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_14_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_9_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_7_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:42:31,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_35-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_37-model_00-model_states.pt... +[default0]:[2023-02-16 16:42:31,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/layer_37-model_00-model_states.pt. +[default0]:[2023-02-16 16:42:31,528] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/mp_rank_00_model_states.pt +[default0]:[2023-02-16 16:42:31,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/mp_rank_00_model_states.pt... +[default0]:[2023-02-16 16:42:31,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/mp_rank_00_model_states.pt. +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_17_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_31_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_28_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_5_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_8_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_26_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_20_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_21_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_24_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_29_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_18_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_25_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_2_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_3_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_19_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_1_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_12_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_15_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:42:31,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_13_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:42:32,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,083] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2023-02-16 16:42:32,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,113] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,111] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_12_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,091] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_12_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,153] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,107] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2023-02-16 16:42:32,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,164] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,111] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_6_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,128] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_6_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,090] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_23_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,176] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_23_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2023-02-16 16:42:32,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,135] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,142] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_17_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,144] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_17_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_28_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,113] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_28_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_21_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,181] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_21_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_20_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,156] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_20_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_29_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,131] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_29_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,130] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_18_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,208] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_18_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2023-02-16 16:42:32,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,162] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_2_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,227] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_2_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,228] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2023-02-16 16:42:32,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,203] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,189] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_1_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,235] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_1_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,225] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_13_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,242] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_13_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_22_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,239] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_22_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_11_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,177] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_11_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,248] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,214] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_30_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,229] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_30_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_4_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,201] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_4_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_10_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,212] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_10_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_16_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,249] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_16_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,232] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,214] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_0_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,245] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_0_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,194] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_9_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,238] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_9_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_7_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,241] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_7_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,221] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_31_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,224] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_31_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_5_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,248] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_5_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2023-02-16 16:42:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,244] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,211] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_8_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,211] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_8_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2023-02-16 16:42:32,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:42:32,264] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:42:32,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,229] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_3_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,252] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_3_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_19_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,248] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_19_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,322] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,347] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_27_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,323] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_27_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_26_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,335] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_26_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2023-02-16 16:42:32,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_24_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:42:32,312] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_24_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:42:32,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2023-02-16 16:42:32,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_25_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:42:32,336] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_25_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:42:32,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,414] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2023-02-16 16:42:32,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_14_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:42:32,406] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_14_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:42:32,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2023-02-16 16:42:32,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:42:32,460] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:42:32,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2023-02-16 16:42:32,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:42:32,486] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:42:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2023-02-16 16:42:32,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_15_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:42:32,506] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_15_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:42:32,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2023-02-16 16:42:32,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:42:32,555] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2750/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:42:32,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]: successfully saved checkpoint at iteration 2750 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default7]:time (ms) | save-checkpoint: 3109.96 +[default7]: iteration 2751/ 3814 | consumed samples: 1408512 | consumed tokens: 2884632576 | elapsed time per iteration (s): 8.55 | learning rate: 2.160E-05 | global batch size: 512 | lm loss: 4.943509E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 59.899 | TFLOPs: 45.24 | +[default7]: iteration 2752/ 3814 | consumed samples: 1409024 | consumed tokens: 2885681152 | elapsed time per iteration (s): 5.44 | learning rate: 2.156E-05 | global batch size: 512 | lm loss: 4.952957E+00 | loss scale: 131072.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.111 | TFLOPs: 71.08 | +[default7]: iteration 2753/ 3814 | consumed samples: 1409536 | consumed tokens: 2886729728 | elapsed time per iteration (s): 5.47 | learning rate: 2.152E-05 | global batch size: 512 | lm loss: 4.946165E+00 | loss scale: 131072.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.636 | TFLOPs: 70.72 | +[default7]: iteration 2754/ 3814 | consumed samples: 1410048 | consumed tokens: 2887778304 | elapsed time per iteration (s): 5.46 | learning rate: 2.148E-05 | global batch size: 512 | lm loss: 4.955083E+00 | loss scale: 131072.0 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.741 | TFLOPs: 70.80 | +[default7]: iteration 2755/ 3814 | consumed samples: 1410560 | consumed tokens: 2888826880 | elapsed time per iteration (s): 5.43 | learning rate: 2.145E-05 | global batch size: 512 | lm loss: 4.926733E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.313 | TFLOPs: 71.24 | +[default7]: iteration 2756/ 3814 | consumed samples: 1411072 | consumed tokens: 2889875456 | elapsed time per iteration (s): 5.44 | learning rate: 2.141E-05 | global batch size: 512 | lm loss: 4.931113E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.180 | TFLOPs: 71.14 | +[default7]: iteration 2757/ 3814 | consumed samples: 1411584 | consumed tokens: 2890924032 | elapsed time per iteration (s): 5.42 | learning rate: 2.137E-05 | global batch size: 512 | lm loss: 4.926753E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.482 | TFLOPs: 71.36 | +[default7]: iteration 2758/ 3814 | consumed samples: 1412096 | consumed tokens: 2891972608 | elapsed time per iteration (s): 5.49 | learning rate: 2.133E-05 | global batch size: 512 | lm loss: 4.920405E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.246 | TFLOPs: 70.43 | +[default7]: iteration 2759/ 3814 | consumed samples: 1412608 | consumed tokens: 2893021184 | elapsed time per iteration (s): 5.47 | learning rate: 2.130E-05 | global batch size: 512 | lm loss: 4.963267E+00 | loss scale: 131072.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.584 | TFLOPs: 70.69 | +[default7]: iteration 2760/ 3814 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 5.44 | learning rate: 2.126E-05 | global batch size: 512 | lm loss: 4.926258E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.087 | TFLOPs: 71.06 | +[default7]: iteration 2761/ 3814 | consumed samples: 1413632 | consumed tokens: 2895118336 | elapsed time per iteration (s): 5.45 | learning rate: 2.122E-05 | global batch size: 512 | lm loss: 4.952419E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.946 | TFLOPs: 70.96 | +[default7]: iteration 2762/ 3814 | consumed samples: 1414144 | consumed tokens: 2896166912 | elapsed time per iteration (s): 5.43 | learning rate: 2.119E-05 | global batch size: 512 | lm loss: 4.943697E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.206 | TFLOPs: 71.15 | +[default7]: iteration 2763/ 3814 | consumed samples: 1414656 | consumed tokens: 2897215488 | elapsed time per iteration (s): 5.42 | learning rate: 2.115E-05 | global batch size: 512 | lm loss: 4.919253E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.420 | TFLOPs: 71.32 | +[default7]: iteration 2764/ 3814 | consumed samples: 1415168 | consumed tokens: 2898264064 | elapsed time per iteration (s): 5.43 | learning rate: 2.111E-05 | global batch size: 512 | lm loss: 4.944034E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.291 | TFLOPs: 71.22 | +[default7]: iteration 2765/ 3814 | consumed samples: 1415680 | consumed tokens: 2899312640 | elapsed time per iteration (s): 5.42 | learning rate: 2.107E-05 | global batch size: 512 | lm loss: 4.948040E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.471 | TFLOPs: 71.35 | +[default7]: iteration 2766/ 3814 | consumed samples: 1416192 | consumed tokens: 2900361216 | elapsed time per iteration (s): 5.44 | learning rate: 2.104E-05 | global batch size: 512 | lm loss: 4.934587E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.126 | TFLOPs: 71.09 | +[default7]: iteration 2767/ 3814 | consumed samples: 1416704 | consumed tokens: 2901409792 | elapsed time per iteration (s): 5.44 | learning rate: 2.100E-05 | global batch size: 512 | lm loss: 4.951807E+00 | loss scale: 131072.0 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.140 | TFLOPs: 71.10 | +[default7]: iteration 2768/ 3814 | consumed samples: 1417216 | consumed tokens: 2902458368 | elapsed time per iteration (s): 5.44 | learning rate: 2.096E-05 | global batch size: 512 | lm loss: 4.943824E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.151 | TFLOPs: 71.11 | +[default7]: iteration 2769/ 3814 | consumed samples: 1417728 | consumed tokens: 2903506944 | elapsed time per iteration (s): 5.43 | learning rate: 2.093E-05 | global batch size: 512 | lm loss: 4.925424E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.246 | TFLOPs: 71.18 | +[default7]: iteration 2770/ 3814 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 5.46 | learning rate: 2.089E-05 | global batch size: 512 | lm loss: 4.936647E+00 | loss scale: 131072.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.804 | TFLOPs: 70.85 | +[default7]: iteration 2771/ 3814 | consumed samples: 1418752 | consumed tokens: 2905604096 | elapsed time per iteration (s): 5.45 | learning rate: 2.085E-05 | global batch size: 512 | lm loss: 4.955808E+00 | loss scale: 131072.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.868 | TFLOPs: 70.90 | +[default7]: iteration 2772/ 3814 | consumed samples: 1419264 | consumed tokens: 2906652672 | elapsed time per iteration (s): 5.41 | learning rate: 2.082E-05 | global batch size: 512 | lm loss: 4.946371E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.611 | TFLOPs: 71.46 | +[default7]: iteration 2773/ 3814 | consumed samples: 1419776 | consumed tokens: 2907701248 | elapsed time per iteration (s): 5.43 | learning rate: 2.078E-05 | global batch size: 512 | lm loss: 4.958283E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.258 | TFLOPs: 71.19 | +[default7]: iteration 2774/ 3814 | consumed samples: 1420288 | consumed tokens: 2908749824 | elapsed time per iteration (s): 5.46 | learning rate: 2.074E-05 | global batch size: 512 | lm loss: 4.953967E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.748 | TFLOPs: 70.81 | +[default7]: iteration 2775/ 3814 | consumed samples: 1420800 | consumed tokens: 2909798400 | elapsed time per iteration (s): 5.47 | learning rate: 2.071E-05 | global batch size: 512 | lm loss: 4.934259E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.677 | TFLOPs: 70.76 | +[default7]: iteration 2776/ 3814 | consumed samples: 1421312 | consumed tokens: 2910846976 | elapsed time per iteration (s): 5.45 | learning rate: 2.067E-05 | global batch size: 512 | lm loss: 4.952403E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.011 | TFLOPs: 71.01 | +[default7]: iteration 2777/ 3814 | consumed samples: 1421824 | consumed tokens: 2911895552 | elapsed time per iteration (s): 5.44 | learning rate: 2.063E-05 | global batch size: 512 | lm loss: 4.953316E+00 | loss scale: 131072.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.047 | TFLOPs: 71.04 | +[default7]: iteration 2778/ 3814 | consumed samples: 1422336 | consumed tokens: 2912944128 | elapsed time per iteration (s): 5.43 | learning rate: 2.060E-05 | global batch size: 512 | lm loss: 4.957504E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.246 | TFLOPs: 71.18 | +[default7]: iteration 2779/ 3814 | consumed samples: 1422848 | consumed tokens: 2913992704 | elapsed time per iteration (s): 5.42 | learning rate: 2.056E-05 | global batch size: 512 | lm loss: 4.930659E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.442 | TFLOPs: 71.33 | +[default7]: iteration 2780/ 3814 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 5.43 | learning rate: 2.052E-05 | global batch size: 512 | lm loss: 4.943465E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.325 | TFLOPs: 71.24 | +[default7]: iteration 2781/ 3814 | consumed samples: 1423872 | consumed tokens: 2916089856 | elapsed time per iteration (s): 5.43 | learning rate: 2.049E-05 | global batch size: 512 | lm loss: 4.945817E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.230 | TFLOPs: 71.17 | +[default7]: iteration 2782/ 3814 | consumed samples: 1424384 | consumed tokens: 2917138432 | elapsed time per iteration (s): 5.43 | learning rate: 2.045E-05 | global batch size: 512 | lm loss: 4.926280E+00 | loss scale: 131072.0 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.254 | TFLOPs: 71.19 | +[default7]: iteration 2783/ 3814 | consumed samples: 1424896 | consumed tokens: 2918187008 | elapsed time per iteration (s): 5.44 | learning rate: 2.041E-05 | global batch size: 512 | lm loss: 4.914886E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.119 | TFLOPs: 71.09 | +[default7]: iteration 2784/ 3814 | consumed samples: 1425408 | consumed tokens: 2919235584 | elapsed time per iteration (s): 5.44 | learning rate: 2.038E-05 | global batch size: 512 | lm loss: 4.966457E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.113 | TFLOPs: 71.08 | +[default7]: iteration 2785/ 3814 | consumed samples: 1425920 | consumed tokens: 2920284160 | elapsed time per iteration (s): 5.45 | learning rate: 2.034E-05 | global batch size: 512 | lm loss: 4.951486E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.932 | TFLOPs: 70.95 | +[default7]: iteration 2786/ 3814 | consumed samples: 1426432 | consumed tokens: 2921332736 | elapsed time per iteration (s): 5.45 | learning rate: 2.030E-05 | global batch size: 512 | lm loss: 4.930329E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.945 | TFLOPs: 70.96 | +[default7]: iteration 2787/ 3814 | consumed samples: 1426944 | consumed tokens: 2922381312 | elapsed time per iteration (s): 5.44 | learning rate: 2.027E-05 | global batch size: 512 | lm loss: 4.930175E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.092 | TFLOPs: 71.07 | +[default7]: iteration 2788/ 3814 | consumed samples: 1427456 | consumed tokens: 2923429888 | elapsed time per iteration (s): 5.43 | learning rate: 2.023E-05 | global batch size: 512 | lm loss: 4.933548E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.206 | TFLOPs: 71.15 | +[default7]: iteration 2789/ 3814 | consumed samples: 1427968 | consumed tokens: 2924478464 | elapsed time per iteration (s): 5.42 | learning rate: 2.019E-05 | global batch size: 512 | lm loss: 4.947808E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.522 | TFLOPs: 71.39 | +[default7]: iteration 2790/ 3814 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 5.42 | learning rate: 2.016E-05 | global batch size: 512 | lm loss: 4.959419E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.397 | TFLOPs: 71.30 | +[default7]: iteration 2791/ 3814 | consumed samples: 1428992 | consumed tokens: 2926575616 | elapsed time per iteration (s): 5.44 | learning rate: 2.012E-05 | global batch size: 512 | lm loss: 4.959277E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.128 | TFLOPs: 71.10 | +[default7]: iteration 2792/ 3814 | consumed samples: 1429504 | consumed tokens: 2927624192 | elapsed time per iteration (s): 5.45 | learning rate: 2.008E-05 | global batch size: 512 | lm loss: 4.930482E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.022 | TFLOPs: 71.02 | +[default7]: iteration 2793/ 3814 | consumed samples: 1430016 | consumed tokens: 2928672768 | elapsed time per iteration (s): 5.43 | learning rate: 2.005E-05 | global batch size: 512 | lm loss: 4.933002E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.306 | TFLOPs: 71.23 | +[default7]: iteration 2794/ 3814 | consumed samples: 1430528 | consumed tokens: 2929721344 | elapsed time per iteration (s): 5.43 | learning rate: 2.001E-05 | global batch size: 512 | lm loss: 4.967683E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.303 | TFLOPs: 71.23 | +[default7]: iteration 2795/ 3814 | consumed samples: 1431040 | consumed tokens: 2930769920 | elapsed time per iteration (s): 5.46 | learning rate: 1.997E-05 | global batch size: 512 | lm loss: 4.933359E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.757 | TFLOPs: 70.82 | +[default7]: iteration 2796/ 3814 | consumed samples: 1431552 | consumed tokens: 2931818496 | elapsed time per iteration (s): 5.46 | learning rate: 1.994E-05 | global batch size: 512 | lm loss: 4.908298E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.821 | TFLOPs: 70.86 | +[default7]: iteration 2797/ 3814 | consumed samples: 1432064 | consumed tokens: 2932867072 | elapsed time per iteration (s): 5.46 | learning rate: 1.990E-05 | global batch size: 512 | lm loss: 4.935935E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.726 | TFLOPs: 70.79 | +[default7]: iteration 2798/ 3814 | consumed samples: 1432576 | consumed tokens: 2933915648 | elapsed time per iteration (s): 5.45 | learning rate: 1.987E-05 | global batch size: 512 | lm loss: 4.927792E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.015 | TFLOPs: 71.01 | +[default7]: iteration 2799/ 3814 | consumed samples: 1433088 | consumed tokens: 2934964224 | elapsed time per iteration (s): 5.46 | learning rate: 1.983E-05 | global batch size: 512 | lm loss: 4.951530E+00 | loss scale: 131072.0 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.745 | TFLOPs: 70.81 | +[default7]: iteration 2800/ 3814 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 5.43 | learning rate: 1.979E-05 | global batch size: 512 | lm loss: 4.956230E+00 | loss scale: 131072.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.364 | TFLOPs: 71.27 | +[default7]: iteration 2801/ 3814 | consumed samples: 1434112 | consumed tokens: 2937061376 | elapsed time per iteration (s): 5.43 | learning rate: 1.976E-05 | global batch size: 512 | lm loss: 4.928400E+00 | loss scale: 131072.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.245 | TFLOPs: 71.18 | +[default7]: iteration 2802/ 3814 | consumed samples: 1434624 | consumed tokens: 2938109952 | elapsed time per iteration (s): 5.43 | learning rate: 1.972E-05 | global batch size: 512 | lm loss: 4.942197E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.374 | TFLOPs: 71.28 | +[default7]: iteration 2803/ 3814 | consumed samples: 1435136 | consumed tokens: 2939158528 | elapsed time per iteration (s): 5.44 | learning rate: 1.968E-05 | global batch size: 512 | lm loss: 4.963600E+00 | loss scale: 131072.0 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.147 | TFLOPs: 71.11 | +[default7]: iteration 2804/ 3814 | consumed samples: 1435648 | consumed tokens: 2940207104 | elapsed time per iteration (s): 5.45 | learning rate: 1.965E-05 | global batch size: 512 | lm loss: 4.931043E+00 | loss scale: 131072.0 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.895 | TFLOPs: 70.92 | +[default7]: iteration 2805/ 3814 | consumed samples: 1436160 | consumed tokens: 2941255680 | elapsed time per iteration (s): 5.44 | learning rate: 1.961E-05 | global batch size: 512 | lm loss: 4.924025E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.056 | TFLOPs: 71.04 | +[default7]: iteration 2806/ 3814 | consumed samples: 1436672 | consumed tokens: 2942304256 | elapsed time per iteration (s): 5.46 | learning rate: 1.958E-05 | global batch size: 512 | lm loss: 4.930665E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.749 | TFLOPs: 70.81 | +[default7]: iteration 2807/ 3814 | consumed samples: 1437184 | consumed tokens: 2943352832 | elapsed time per iteration (s): 5.46 | learning rate: 1.954E-05 | global batch size: 512 | lm loss: 4.954532E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.851 | TFLOPs: 70.89 | +[default7]: iteration 2808/ 3814 | consumed samples: 1437696 | consumed tokens: 2944401408 | elapsed time per iteration (s): 5.45 | learning rate: 1.950E-05 | global batch size: 512 | lm loss: 4.944182E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.925 | TFLOPs: 70.94 | +[default7]: iteration 2809/ 3814 | consumed samples: 1438208 | consumed tokens: 2945449984 | elapsed time per iteration (s): 5.46 | learning rate: 1.947E-05 | global batch size: 512 | lm loss: 4.935406E+00 | loss scale: 131072.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.757 | TFLOPs: 70.82 | +[default7]: iteration 2810/ 3814 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 5.47 | learning rate: 1.943E-05 | global batch size: 512 | lm loss: 4.922543E+00 | loss scale: 131072.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.619 | TFLOPs: 70.71 | +[default7]: iteration 2811/ 3814 | consumed samples: 1439232 | consumed tokens: 2947547136 | elapsed time per iteration (s): 5.45 | learning rate: 1.940E-05 | global batch size: 512 | lm loss: 4.942382E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.959 | TFLOPs: 70.97 | +[default7]: iteration 2812/ 3814 | consumed samples: 1439744 | consumed tokens: 2948595712 | elapsed time per iteration (s): 5.46 | learning rate: 1.936E-05 | global batch size: 512 | lm loss: 4.949736E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.853 | TFLOPs: 70.89 | +[default7]: iteration 2813/ 3814 | consumed samples: 1440256 | consumed tokens: 2949644288 | elapsed time per iteration (s): 5.46 | learning rate: 1.932E-05 | global batch size: 512 | lm loss: 4.949234E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.823 | TFLOPs: 70.87 | +[default7]: iteration 2814/ 3814 | consumed samples: 1440768 | consumed tokens: 2950692864 | elapsed time per iteration (s): 5.45 | learning rate: 1.929E-05 | global batch size: 512 | lm loss: 4.957901E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.900 | TFLOPs: 70.92 | +[default7]: iteration 2815/ 3814 | consumed samples: 1441280 | consumed tokens: 2951741440 | elapsed time per iteration (s): 5.46 | learning rate: 1.925E-05 | global batch size: 512 | lm loss: 4.948306E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.805 | TFLOPs: 70.85 | +[default7]: iteration 2816/ 3814 | consumed samples: 1441792 | consumed tokens: 2952790016 | elapsed time per iteration (s): 5.45 | learning rate: 1.922E-05 | global batch size: 512 | lm loss: 4.929760E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.866 | TFLOPs: 70.90 | +[default7]: iteration 2817/ 3814 | consumed samples: 1442304 | consumed tokens: 2953838592 | elapsed time per iteration (s): 5.44 | learning rate: 1.918E-05 | global batch size: 512 | lm loss: 4.941677E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.031 | TFLOPs: 71.02 | +[default7]: iteration 2818/ 3814 | consumed samples: 1442816 | consumed tokens: 2954887168 | elapsed time per iteration (s): 5.45 | learning rate: 1.914E-05 | global batch size: 512 | lm loss: 4.934575E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.864 | TFLOPs: 70.90 | +[default7]: iteration 2819/ 3814 | consumed samples: 1443328 | consumed tokens: 2955935744 | elapsed time per iteration (s): 5.44 | learning rate: 1.911E-05 | global batch size: 512 | lm loss: 4.927119E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.114 | TFLOPs: 71.09 | +[default7]: iteration 2820/ 3814 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 5.43 | learning rate: 1.907E-05 | global batch size: 512 | lm loss: 4.927861E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.264 | TFLOPs: 71.20 | +[default7]: iteration 2821/ 3814 | consumed samples: 1444352 | consumed tokens: 2958032896 | elapsed time per iteration (s): 5.45 | learning rate: 1.904E-05 | global batch size: 512 | lm loss: 4.948848E+00 | loss scale: 131072.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.025 | TFLOPs: 71.02 | +[default7]: iteration 2822/ 3814 | consumed samples: 1444864 | consumed tokens: 2959081472 | elapsed time per iteration (s): 5.46 | learning rate: 1.900E-05 | global batch size: 512 | lm loss: 4.940724E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.688 | TFLOPs: 70.76 | +[default7]: iteration 2823/ 3814 | consumed samples: 1445376 | consumed tokens: 2960130048 | elapsed time per iteration (s): 5.44 | learning rate: 1.897E-05 | global batch size: 512 | lm loss: 4.938113E+00 | loss scale: 131072.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.164 | TFLOPs: 71.12 | +[default7]: iteration 2824/ 3814 | consumed samples: 1445888 | consumed tokens: 2961178624 | elapsed time per iteration (s): 5.45 | learning rate: 1.893E-05 | global batch size: 512 | lm loss: 4.923431E+00 | loss scale: 131072.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.010 | TFLOPs: 71.01 | +[default7]: iteration 2825/ 3814 | consumed samples: 1446400 | consumed tokens: 2962227200 | elapsed time per iteration (s): 5.44 | learning rate: 1.890E-05 | global batch size: 512 | lm loss: 4.935600E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.116 | TFLOPs: 71.09 | +[default7]: iteration 2826/ 3814 | consumed samples: 1446912 | consumed tokens: 2963275776 | elapsed time per iteration (s): 5.44 | learning rate: 1.886E-05 | global batch size: 512 | lm loss: 4.931447E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.073 | TFLOPs: 71.05 | +[default7]: iteration 2827/ 3814 | consumed samples: 1447424 | consumed tokens: 2964324352 | elapsed time per iteration (s): 5.44 | learning rate: 1.882E-05 | global batch size: 512 | lm loss: 4.951434E+00 | loss scale: 131072.0 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.194 | TFLOPs: 71.15 | +[default7]: iteration 2828/ 3814 | consumed samples: 1447936 | consumed tokens: 2965372928 | elapsed time per iteration (s): 5.45 | learning rate: 1.879E-05 | global batch size: 512 | lm loss: 4.910131E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.000 | TFLOPs: 71.00 | +[default7]: iteration 2829/ 3814 | consumed samples: 1448448 | consumed tokens: 2966421504 | elapsed time per iteration (s): 5.46 | learning rate: 1.875E-05 | global batch size: 512 | lm loss: 4.938957E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.808 | TFLOPs: 70.85 | +[default7]: iteration 2830/ 3814 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 5.45 | learning rate: 1.872E-05 | global batch size: 512 | lm loss: 4.926614E+00 | loss scale: 131072.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.909 | TFLOPs: 70.93 | +[default7]: iteration 2831/ 3814 | consumed samples: 1449472 | consumed tokens: 2968518656 | elapsed time per iteration (s): 5.45 | learning rate: 1.868E-05 | global batch size: 512 | lm loss: 4.921464E+00 | loss scale: 131072.0 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.900 | TFLOPs: 70.92 | +[default7]: iteration 2832/ 3814 | consumed samples: 1449984 | consumed tokens: 2969567232 | elapsed time per iteration (s): 5.45 | learning rate: 1.865E-05 | global batch size: 512 | lm loss: 4.929194E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.934 | TFLOPs: 70.95 | +[default7]: iteration 2833/ 3814 | consumed samples: 1450496 | consumed tokens: 2970615808 | elapsed time per iteration (s): 5.44 | learning rate: 1.861E-05 | global batch size: 512 | lm loss: 4.924949E+00 | loss scale: 131072.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.032 | TFLOPs: 71.02 | +[default7]: iteration 2834/ 3814 | consumed samples: 1451008 | consumed tokens: 2971664384 | elapsed time per iteration (s): 5.46 | learning rate: 1.858E-05 | global batch size: 512 | lm loss: 4.964262E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.704 | TFLOPs: 70.78 | +[default7]: iteration 2835/ 3814 | consumed samples: 1451520 | consumed tokens: 2972712960 | elapsed time per iteration (s): 5.45 | learning rate: 1.854E-05 | global batch size: 512 | lm loss: 4.908269E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.931 | TFLOPs: 70.95 | +[default7]: iteration 2836/ 3814 | consumed samples: 1452032 | consumed tokens: 2973761536 | elapsed time per iteration (s): 5.46 | learning rate: 1.851E-05 | global batch size: 512 | lm loss: 4.931759E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.853 | TFLOPs: 70.89 | +[default7]: iteration 2837/ 3814 | consumed samples: 1452544 | consumed tokens: 2974810112 | elapsed time per iteration (s): 5.47 | learning rate: 1.847E-05 | global batch size: 512 | lm loss: 4.951954E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.568 | TFLOPs: 70.67 | +[default7]: iteration 2838/ 3814 | consumed samples: 1453056 | consumed tokens: 2975858688 | elapsed time per iteration (s): 5.44 | learning rate: 1.843E-05 | global batch size: 512 | lm loss: 4.888069E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.172 | TFLOPs: 71.13 | +[default7]: iteration 2839/ 3814 | consumed samples: 1453568 | consumed tokens: 2976907264 | elapsed time per iteration (s): 5.46 | learning rate: 1.840E-05 | global batch size: 512 | lm loss: 4.920809E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.719 | TFLOPs: 70.79 | +[default7]: iteration 2840/ 3814 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 5.44 | learning rate: 1.836E-05 | global batch size: 512 | lm loss: 4.910502E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.078 | TFLOPs: 71.06 | +[default7]: iteration 2841/ 3814 | consumed samples: 1454592 | consumed tokens: 2979004416 | elapsed time per iteration (s): 5.44 | learning rate: 1.833E-05 | global batch size: 512 | lm loss: 4.943367E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.199 | TFLOPs: 71.15 | +[default7]: iteration 2842/ 3814 | consumed samples: 1455104 | consumed tokens: 2980052992 | elapsed time per iteration (s): 5.44 | learning rate: 1.829E-05 | global batch size: 512 | lm loss: 4.968048E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.050 | TFLOPs: 71.04 | +[default7]: iteration 2843/ 3814 | consumed samples: 1455616 | consumed tokens: 2981101568 | elapsed time per iteration (s): 5.49 | learning rate: 1.826E-05 | global batch size: 512 | lm loss: 4.919245E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.304 | TFLOPs: 70.47 | +[default7]: iteration 2844/ 3814 | consumed samples: 1456128 | consumed tokens: 2982150144 | elapsed time per iteration (s): 5.51 | learning rate: 1.822E-05 | global batch size: 512 | lm loss: 4.920358E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.990 | TFLOPs: 70.24 | +[default7]: iteration 2845/ 3814 | consumed samples: 1456640 | consumed tokens: 2983198720 | elapsed time per iteration (s): 5.51 | learning rate: 1.819E-05 | global batch size: 512 | lm loss: 4.937951E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.916 | TFLOPs: 70.18 | +[default7]: iteration 2846/ 3814 | consumed samples: 1457152 | consumed tokens: 2984247296 | elapsed time per iteration (s): 5.50 | learning rate: 1.815E-05 | global batch size: 512 | lm loss: 4.900369E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.136 | TFLOPs: 70.35 | +[default7]: iteration 2847/ 3814 | consumed samples: 1457664 | consumed tokens: 2985295872 | elapsed time per iteration (s): 5.49 | learning rate: 1.812E-05 | global batch size: 512 | lm loss: 4.948953E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.202 | TFLOPs: 70.40 | +[default7]: iteration 2848/ 3814 | consumed samples: 1458176 | consumed tokens: 2986344448 | elapsed time per iteration (s): 5.47 | learning rate: 1.808E-05 | global batch size: 512 | lm loss: 4.947191E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.524 | TFLOPs: 70.64 | +[default7]: iteration 2849/ 3814 | consumed samples: 1458688 | consumed tokens: 2987393024 | elapsed time per iteration (s): 5.48 | learning rate: 1.805E-05 | global batch size: 512 | lm loss: 4.907746E+00 | loss scale: 131072.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.411 | TFLOPs: 70.55 | +[default7]: iteration 2850/ 3814 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 5.48 | learning rate: 1.801E-05 | global batch size: 512 | lm loss: 4.928157E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.455 | TFLOPs: 70.59 | +[default7]: iteration 2851/ 3814 | consumed samples: 1459712 | consumed tokens: 2989490176 | elapsed time per iteration (s): 5.50 | learning rate: 1.798E-05 | global batch size: 512 | lm loss: 4.937436E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.144 | TFLOPs: 70.35 | +[default7]: iteration 2852/ 3814 | consumed samples: 1460224 | consumed tokens: 2990538752 | elapsed time per iteration (s): 5.50 | learning rate: 1.794E-05 | global batch size: 512 | lm loss: 4.947366E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.156 | TFLOPs: 70.36 | +[default7]: iteration 2853/ 3814 | consumed samples: 1460736 | consumed tokens: 2991587328 | elapsed time per iteration (s): 5.51 | learning rate: 1.791E-05 | global batch size: 512 | lm loss: 4.923574E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.852 | TFLOPs: 70.13 | +[default7]: iteration 2854/ 3814 | consumed samples: 1461248 | consumed tokens: 2992635904 | elapsed time per iteration (s): 5.49 | learning rate: 1.787E-05 | global batch size: 512 | lm loss: 4.916951E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.243 | TFLOPs: 70.43 | +[default7]: iteration 2855/ 3814 | consumed samples: 1461760 | consumed tokens: 2993684480 | elapsed time per iteration (s): 5.50 | learning rate: 1.784E-05 | global batch size: 512 | lm loss: 4.939948E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.013 | TFLOPs: 70.25 | +[default7]: iteration 2856/ 3814 | consumed samples: 1462272 | consumed tokens: 2994733056 | elapsed time per iteration (s): 5.49 | learning rate: 1.780E-05 | global batch size: 512 | lm loss: 4.936202E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.287 | TFLOPs: 70.46 | +[default7]: iteration 2857/ 3814 | consumed samples: 1462784 | consumed tokens: 2995781632 | elapsed time per iteration (s): 5.49 | learning rate: 1.777E-05 | global batch size: 512 | lm loss: 4.927880E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.293 | TFLOPs: 70.47 | +[default7]: iteration 2858/ 3814 | consumed samples: 1463296 | consumed tokens: 2996830208 | elapsed time per iteration (s): 5.51 | learning rate: 1.774E-05 | global batch size: 512 | lm loss: 4.940566E+00 | loss scale: 131072.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.975 | TFLOPs: 70.22 | +[default7]: iteration 2859/ 3814 | consumed samples: 1463808 | consumed tokens: 2997878784 | elapsed time per iteration (s): 5.48 | learning rate: 1.770E-05 | global batch size: 512 | lm loss: 4.948548E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.373 | TFLOPs: 70.53 | +[default7]: iteration 2860/ 3814 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 5.49 | learning rate: 1.767E-05 | global batch size: 512 | lm loss: 4.935204E+00 | loss scale: 131072.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.184 | TFLOPs: 70.38 | +[default7]: iteration 2861/ 3814 | consumed samples: 1464832 | consumed tokens: 2999975936 | elapsed time per iteration (s): 5.48 | learning rate: 1.763E-05 | global batch size: 512 | lm loss: 4.944736E+00 | loss scale: 131072.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.442 | TFLOPs: 70.58 | +[default7]: iteration 2862/ 3814 | consumed samples: 1465344 | consumed tokens: 3001024512 | elapsed time per iteration (s): 5.50 | learning rate: 1.760E-05 | global batch size: 512 | lm loss: 4.899832E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.023 | TFLOPs: 70.26 | +[default7]: iteration 2863/ 3814 | consumed samples: 1465856 | consumed tokens: 3002073088 | elapsed time per iteration (s): 5.48 | learning rate: 1.756E-05 | global batch size: 512 | lm loss: 4.939784E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.373 | TFLOPs: 70.53 | +[default7]: iteration 2864/ 3814 | consumed samples: 1466368 | consumed tokens: 3003121664 | elapsed time per iteration (s): 5.48 | learning rate: 1.753E-05 | global batch size: 512 | lm loss: 4.922580E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.484 | TFLOPs: 70.61 | +[default7]: iteration 2865/ 3814 | consumed samples: 1466880 | consumed tokens: 3004170240 | elapsed time per iteration (s): 5.45 | learning rate: 1.749E-05 | global batch size: 512 | lm loss: 4.936839E+00 | loss scale: 131072.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.867 | TFLOPs: 70.90 | +[default7]: iteration 2866/ 3814 | consumed samples: 1467392 | consumed tokens: 3005218816 | elapsed time per iteration (s): 5.49 | learning rate: 1.746E-05 | global batch size: 512 | lm loss: 4.929844E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.331 | TFLOPs: 70.49 | +[default7]: iteration 2867/ 3814 | consumed samples: 1467904 | consumed tokens: 3006267392 | elapsed time per iteration (s): 5.47 | learning rate: 1.742E-05 | global batch size: 512 | lm loss: 4.943112E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.624 | TFLOPs: 70.72 | +[default7]: iteration 2868/ 3814 | consumed samples: 1468416 | consumed tokens: 3007315968 | elapsed time per iteration (s): 5.47 | learning rate: 1.739E-05 | global batch size: 512 | lm loss: 4.905515E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.536 | TFLOPs: 70.65 | +[default7]: iteration 2869/ 3814 | consumed samples: 1468928 | consumed tokens: 3008364544 | elapsed time per iteration (s): 5.46 | learning rate: 1.736E-05 | global batch size: 512 | lm loss: 4.912004E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.743 | TFLOPs: 70.81 | +[default7]: iteration 2870/ 3814 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 5.47 | learning rate: 1.732E-05 | global batch size: 512 | lm loss: 4.918135E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.670 | TFLOPs: 70.75 | +[default7]: iteration 2871/ 3814 | consumed samples: 1469952 | consumed tokens: 3010461696 | elapsed time per iteration (s): 5.48 | learning rate: 1.729E-05 | global batch size: 512 | lm loss: 4.928819E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.467 | TFLOPs: 70.60 | +[default7]: iteration 2872/ 3814 | consumed samples: 1470464 | consumed tokens: 3011510272 | elapsed time per iteration (s): 5.46 | learning rate: 1.725E-05 | global batch size: 512 | lm loss: 4.923646E+00 | loss scale: 131072.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.834 | TFLOPs: 70.87 | +[default7]: iteration 2873/ 3814 | consumed samples: 1470976 | consumed tokens: 3012558848 | elapsed time per iteration (s): 5.46 | learning rate: 1.722E-05 | global batch size: 512 | lm loss: 4.954929E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.738 | TFLOPs: 70.80 | +[default7]: iteration 2874/ 3814 | consumed samples: 1471488 | consumed tokens: 3013607424 | elapsed time per iteration (s): 5.46 | learning rate: 1.718E-05 | global batch size: 512 | lm loss: 4.938239E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.689 | TFLOPs: 70.76 | +[default7]: iteration 2875/ 3814 | consumed samples: 1472000 | consumed tokens: 3014656000 | elapsed time per iteration (s): 5.47 | learning rate: 1.715E-05 | global batch size: 512 | lm loss: 4.936963E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.658 | TFLOPs: 70.74 | +[default7]: iteration 2876/ 3814 | consumed samples: 1472512 | consumed tokens: 3015704576 | elapsed time per iteration (s): 5.47 | learning rate: 1.712E-05 | global batch size: 512 | lm loss: 4.949780E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.582 | TFLOPs: 70.68 | +[default7]: iteration 2877/ 3814 | consumed samples: 1473024 | consumed tokens: 3016753152 | elapsed time per iteration (s): 5.47 | learning rate: 1.708E-05 | global batch size: 512 | lm loss: 4.926065E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.588 | TFLOPs: 70.69 | +[default7]: iteration 2878/ 3814 | consumed samples: 1473536 | consumed tokens: 3017801728 | elapsed time per iteration (s): 5.43 | learning rate: 1.705E-05 | global batch size: 512 | lm loss: 4.940857E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.225 | TFLOPs: 71.17 | +[default7]: iteration 2879/ 3814 | consumed samples: 1474048 | consumed tokens: 3018850304 | elapsed time per iteration (s): 5.47 | learning rate: 1.701E-05 | global batch size: 512 | lm loss: 4.913507E+00 | loss scale: 131072.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.685 | TFLOPs: 70.76 | +[default7]: iteration 2880/ 3814 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 5.45 | learning rate: 1.698E-05 | global batch size: 512 | lm loss: 4.925410E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.914 | TFLOPs: 70.93 | +[default7]: iteration 2881/ 3814 | consumed samples: 1475072 | consumed tokens: 3020947456 | elapsed time per iteration (s): 5.44 | learning rate: 1.694E-05 | global batch size: 512 | lm loss: 4.939539E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.055 | TFLOPs: 71.04 | +[default7]: iteration 2882/ 3814 | consumed samples: 1475584 | consumed tokens: 3021996032 | elapsed time per iteration (s): 5.48 | learning rate: 1.691E-05 | global batch size: 512 | lm loss: 4.922110E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.452 | TFLOPs: 70.59 | +[default7]: iteration 2883/ 3814 | consumed samples: 1476096 | consumed tokens: 3023044608 | elapsed time per iteration (s): 5.45 | learning rate: 1.688E-05 | global batch size: 512 | lm loss: 4.933686E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.973 | TFLOPs: 70.98 | +[default7]: iteration 2884/ 3814 | consumed samples: 1476608 | consumed tokens: 3024093184 | elapsed time per iteration (s): 5.45 | learning rate: 1.684E-05 | global batch size: 512 | lm loss: 4.930764E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.998 | TFLOPs: 71.00 | +[default7]: iteration 2885/ 3814 | consumed samples: 1477120 | consumed tokens: 3025141760 | elapsed time per iteration (s): 5.45 | learning rate: 1.681E-05 | global batch size: 512 | lm loss: 4.910392E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.971 | TFLOPs: 70.98 | +[default7]: iteration 2886/ 3814 | consumed samples: 1477632 | consumed tokens: 3026190336 | elapsed time per iteration (s): 5.45 | learning rate: 1.677E-05 | global batch size: 512 | lm loss: 4.889772E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.872 | TFLOPs: 70.90 | +[default7]: iteration 2887/ 3814 | consumed samples: 1478144 | consumed tokens: 3027238912 | elapsed time per iteration (s): 5.48 | learning rate: 1.674E-05 | global batch size: 512 | lm loss: 4.945220E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.377 | TFLOPs: 70.53 | +[default7]: iteration 2888/ 3814 | consumed samples: 1478656 | consumed tokens: 3028287488 | elapsed time per iteration (s): 5.51 | learning rate: 1.671E-05 | global batch size: 512 | lm loss: 4.925982E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.959 | TFLOPs: 70.21 | +[default7]: iteration 2889/ 3814 | consumed samples: 1479168 | consumed tokens: 3029336064 | elapsed time per iteration (s): 5.49 | learning rate: 1.667E-05 | global batch size: 512 | lm loss: 4.923630E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.306 | TFLOPs: 70.47 | +[default7]: iteration 2890/ 3814 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 5.46 | learning rate: 1.664E-05 | global batch size: 512 | lm loss: 4.919209E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.771 | TFLOPs: 70.83 | +[default7]: iteration 2891/ 3814 | consumed samples: 1480192 | consumed tokens: 3031433216 | elapsed time per iteration (s): 5.48 | learning rate: 1.660E-05 | global batch size: 512 | lm loss: 4.929988E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.401 | TFLOPs: 70.55 | +[default7]: iteration 2892/ 3814 | consumed samples: 1480704 | consumed tokens: 3032481792 | elapsed time per iteration (s): 5.45 | learning rate: 1.657E-05 | global batch size: 512 | lm loss: 4.925477E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.867 | TFLOPs: 70.90 | +[default7]: iteration 2893/ 3814 | consumed samples: 1481216 | consumed tokens: 3033530368 | elapsed time per iteration (s): 5.45 | learning rate: 1.654E-05 | global batch size: 512 | lm loss: 4.905919E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.867 | TFLOPs: 70.90 | +[default7]: iteration 2894/ 3814 | consumed samples: 1481728 | consumed tokens: 3034578944 | elapsed time per iteration (s): 5.49 | learning rate: 1.650E-05 | global batch size: 512 | lm loss: 4.961928E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.340 | TFLOPs: 70.50 | +[default7]: iteration 2895/ 3814 | consumed samples: 1482240 | consumed tokens: 3035627520 | elapsed time per iteration (s): 5.46 | learning rate: 1.647E-05 | global batch size: 512 | lm loss: 4.942277E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.741 | TFLOPs: 70.80 | +[default7]: iteration 2896/ 3814 | consumed samples: 1482752 | consumed tokens: 3036676096 | elapsed time per iteration (s): 5.46 | learning rate: 1.644E-05 | global batch size: 512 | lm loss: 4.923624E+00 | loss scale: 131072.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.775 | TFLOPs: 70.83 | +[default7]: iteration 2897/ 3814 | consumed samples: 1483264 | consumed tokens: 3037724672 | elapsed time per iteration (s): 5.46 | learning rate: 1.640E-05 | global batch size: 512 | lm loss: 4.953377E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.731 | TFLOPs: 70.80 | +[default7]: iteration 2898/ 3814 | consumed samples: 1483776 | consumed tokens: 3038773248 | elapsed time per iteration (s): 5.47 | learning rate: 1.637E-05 | global batch size: 512 | lm loss: 4.924223E+00 | loss scale: 131072.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.684 | TFLOPs: 70.76 | +[default7]: iteration 2899/ 3814 | consumed samples: 1484288 | consumed tokens: 3039821824 | elapsed time per iteration (s): 5.48 | learning rate: 1.634E-05 | global batch size: 512 | lm loss: 4.937704E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.487 | TFLOPs: 70.61 | +[default7]: iteration 2900/ 3814 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 5.46 | learning rate: 1.630E-05 | global batch size: 512 | lm loss: 4.904630E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.723 | TFLOPs: 70.79 | +[default7]: iteration 2901/ 3814 | consumed samples: 1485312 | consumed tokens: 3041918976 | elapsed time per iteration (s): 5.46 | learning rate: 1.627E-05 | global batch size: 512 | lm loss: 4.924174E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.710 | TFLOPs: 70.78 | +[default7]: iteration 2902/ 3814 | consumed samples: 1485824 | consumed tokens: 3042967552 | elapsed time per iteration (s): 5.47 | learning rate: 1.623E-05 | global batch size: 512 | lm loss: 4.929523E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.661 | TFLOPs: 70.74 | +[default7]: iteration 2903/ 3814 | consumed samples: 1486336 | consumed tokens: 3044016128 | elapsed time per iteration (s): 5.47 | learning rate: 1.620E-05 | global batch size: 512 | lm loss: 4.922709E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.580 | TFLOPs: 70.68 | +[default7]: iteration 2904/ 3814 | consumed samples: 1486848 | consumed tokens: 3045064704 | elapsed time per iteration (s): 5.45 | learning rate: 1.617E-05 | global batch size: 512 | lm loss: 4.926269E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.985 | TFLOPs: 70.99 | +[default7]: iteration 2905/ 3814 | consumed samples: 1487360 | consumed tokens: 3046113280 | elapsed time per iteration (s): 5.45 | learning rate: 1.613E-05 | global batch size: 512 | lm loss: 4.934804E+00 | loss scale: 131072.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.937 | TFLOPs: 70.95 | +[default7]: iteration 2906/ 3814 | consumed samples: 1487872 | consumed tokens: 3047161856 | elapsed time per iteration (s): 5.47 | learning rate: 1.610E-05 | global batch size: 512 | lm loss: 4.909154E+00 | loss scale: 131072.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.578 | TFLOPs: 70.68 | +[default7]: iteration 2907/ 3814 | consumed samples: 1488384 | consumed tokens: 3048210432 | elapsed time per iteration (s): 5.45 | learning rate: 1.607E-05 | global batch size: 512 | lm loss: 4.941099E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.999 | TFLOPs: 71.00 | +[default7]: iteration 2908/ 3814 | consumed samples: 1488896 | consumed tokens: 3049259008 | elapsed time per iteration (s): 5.45 | learning rate: 1.603E-05 | global batch size: 512 | lm loss: 4.968219E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.010 | TFLOPs: 71.01 | +[default7]: iteration 2909/ 3814 | consumed samples: 1489408 | consumed tokens: 3050307584 | elapsed time per iteration (s): 5.48 | learning rate: 1.600E-05 | global batch size: 512 | lm loss: 4.927263E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.483 | TFLOPs: 70.61 | +[default7]: iteration 2910/ 3814 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 5.45 | learning rate: 1.597E-05 | global batch size: 512 | lm loss: 4.922312E+00 | loss scale: 131072.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.951 | TFLOPs: 70.96 | +[default7]: iteration 2911/ 3814 | consumed samples: 1490432 | consumed tokens: 3052404736 | elapsed time per iteration (s): 5.44 | learning rate: 1.593E-05 | global batch size: 512 | lm loss: 4.922726E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.082 | TFLOPs: 71.06 | +[default7]: iteration 2912/ 3814 | consumed samples: 1490944 | consumed tokens: 3053453312 | elapsed time per iteration (s): 5.46 | learning rate: 1.590E-05 | global batch size: 512 | lm loss: 4.928871E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.784 | TFLOPs: 70.84 | +[default7]: iteration 2913/ 3814 | consumed samples: 1491456 | consumed tokens: 3054501888 | elapsed time per iteration (s): 5.44 | learning rate: 1.587E-05 | global batch size: 512 | lm loss: 4.934693E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.128 | TFLOPs: 71.10 | +[default7]: iteration 2914/ 3814 | consumed samples: 1491968 | consumed tokens: 3055550464 | elapsed time per iteration (s): 5.45 | learning rate: 1.583E-05 | global batch size: 512 | lm loss: 4.915496E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.944 | TFLOPs: 70.96 | +[default7]: iteration 2915/ 3814 | consumed samples: 1492480 | consumed tokens: 3056599040 | elapsed time per iteration (s): 5.46 | learning rate: 1.580E-05 | global batch size: 512 | lm loss: 4.908684E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.720 | TFLOPs: 70.79 | +[default7]: iteration 2916/ 3814 | consumed samples: 1492992 | consumed tokens: 3057647616 | elapsed time per iteration (s): 5.45 | learning rate: 1.577E-05 | global batch size: 512 | lm loss: 4.921475E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.973 | TFLOPs: 70.98 | +[default7]: iteration 2917/ 3814 | consumed samples: 1493504 | consumed tokens: 3058696192 | elapsed time per iteration (s): 5.47 | learning rate: 1.573E-05 | global batch size: 512 | lm loss: 4.924382E+00 | loss scale: 131072.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.611 | TFLOPs: 70.71 | +[default7]: iteration 2918/ 3814 | consumed samples: 1494016 | consumed tokens: 3059744768 | elapsed time per iteration (s): 5.45 | learning rate: 1.570E-05 | global batch size: 512 | lm loss: 4.903320E+00 | loss scale: 131072.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.021 | TFLOPs: 71.02 | +[default7]: iteration 2919/ 3814 | consumed samples: 1494528 | consumed tokens: 3060793344 | elapsed time per iteration (s): 5.44 | learning rate: 1.567E-05 | global batch size: 512 | lm loss: 4.915483E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.097 | TFLOPs: 71.07 | +[default7]: iteration 2920/ 3814 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 5.46 | learning rate: 1.564E-05 | global batch size: 512 | lm loss: 4.894407E+00 | loss scale: 131072.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.857 | TFLOPs: 70.89 | +[default7]: iteration 2921/ 3814 | consumed samples: 1495552 | consumed tokens: 3062890496 | elapsed time per iteration (s): 5.45 | learning rate: 1.560E-05 | global batch size: 512 | lm loss: 4.946178E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.000 | TFLOPs: 71.00 | +[default7]: iteration 2922/ 3814 | consumed samples: 1496064 | consumed tokens: 3063939072 | elapsed time per iteration (s): 5.46 | learning rate: 1.557E-05 | global batch size: 512 | lm loss: 4.936790E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.706 | TFLOPs: 70.78 | +[default7]: iteration 2923/ 3814 | consumed samples: 1496576 | consumed tokens: 3064987648 | elapsed time per iteration (s): 5.47 | learning rate: 1.554E-05 | global batch size: 512 | lm loss: 4.938633E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.554 | TFLOPs: 70.66 | +[default7]: iteration 2924/ 3814 | consumed samples: 1497088 | consumed tokens: 3066036224 | elapsed time per iteration (s): 5.46 | learning rate: 1.550E-05 | global batch size: 512 | lm loss: 4.936730E+00 | loss scale: 131072.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.828 | TFLOPs: 70.87 | +[default7]: iteration 2925/ 3814 | consumed samples: 1497600 | consumed tokens: 3067084800 | elapsed time per iteration (s): 5.47 | learning rate: 1.547E-05 | global batch size: 512 | lm loss: 4.905755E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.592 | TFLOPs: 70.69 | +[default7]: iteration 2926/ 3814 | consumed samples: 1498112 | consumed tokens: 3068133376 | elapsed time per iteration (s): 5.45 | learning rate: 1.544E-05 | global batch size: 512 | lm loss: 4.905930E+00 | loss scale: 131072.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.869 | TFLOPs: 70.90 | +[default7]: iteration 2927/ 3814 | consumed samples: 1498624 | consumed tokens: 3069181952 | elapsed time per iteration (s): 5.45 | learning rate: 1.541E-05 | global batch size: 512 | lm loss: 4.928586E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.914 | TFLOPs: 70.93 | +[default7]: iteration 2928/ 3814 | consumed samples: 1499136 | consumed tokens: 3070230528 | elapsed time per iteration (s): 5.44 | learning rate: 1.537E-05 | global batch size: 512 | lm loss: 4.892801E+00 | loss scale: 131072.0 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.127 | TFLOPs: 71.10 | +[default7]: iteration 2929/ 3814 | consumed samples: 1499648 | consumed tokens: 3071279104 | elapsed time per iteration (s): 5.44 | learning rate: 1.534E-05 | global batch size: 512 | lm loss: 4.932197E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.032 | TFLOPs: 71.02 | +[default7]: iteration 2930/ 3814 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 5.45 | learning rate: 1.531E-05 | global batch size: 512 | lm loss: 4.914823E+00 | loss scale: 131072.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.972 | TFLOPs: 70.98 | +[default7]: iteration 2931/ 3814 | consumed samples: 1500672 | consumed tokens: 3073376256 | elapsed time per iteration (s): 5.50 | learning rate: 1.527E-05 | global batch size: 512 | lm loss: 4.892153E+00 | loss scale: 131072.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.112 | TFLOPs: 70.33 | +[default7]: iteration 2932/ 3814 | consumed samples: 1501184 | consumed tokens: 3074424832 | elapsed time per iteration (s): 5.44 | learning rate: 1.524E-05 | global batch size: 512 | lm loss: 4.943766E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.056 | TFLOPs: 71.04 | +[default7]: iteration 2933/ 3814 | consumed samples: 1501696 | consumed tokens: 3075473408 | elapsed time per iteration (s): 5.46 | learning rate: 1.521E-05 | global batch size: 512 | lm loss: 4.927283E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.690 | TFLOPs: 70.77 | +[default7]: iteration 2934/ 3814 | consumed samples: 1502208 | consumed tokens: 3076521984 | elapsed time per iteration (s): 5.47 | learning rate: 1.518E-05 | global batch size: 512 | lm loss: 4.909650E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.545 | TFLOPs: 70.66 | +[default7]: iteration 2935/ 3814 | consumed samples: 1502720 | consumed tokens: 3077570560 | elapsed time per iteration (s): 5.46 | learning rate: 1.514E-05 | global batch size: 512 | lm loss: 4.925519E+00 | loss scale: 131072.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.794 | TFLOPs: 70.84 | +[default7]: iteration 2936/ 3814 | consumed samples: 1503232 | consumed tokens: 3078619136 | elapsed time per iteration (s): 5.49 | learning rate: 1.511E-05 | global batch size: 512 | lm loss: 4.905052E+00 | loss scale: 131072.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.310 | TFLOPs: 70.48 | +[default7]: iteration 2937/ 3814 | consumed samples: 1503744 | consumed tokens: 3079667712 | elapsed time per iteration (s): 5.47 | learning rate: 1.508E-05 | global batch size: 512 | lm loss: 4.897516E+00 | loss scale: 131072.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.570 | TFLOPs: 70.67 | +[default7]: iteration 2938/ 3814 | consumed samples: 1504256 | consumed tokens: 3080716288 | elapsed time per iteration (s): 5.46 | learning rate: 1.505E-05 | global batch size: 512 | lm loss: 4.920185E+00 | loss scale: 131072.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.822 | TFLOPs: 70.86 | +[default7]: iteration 2939/ 3814 | consumed samples: 1504768 | consumed tokens: 3081764864 | elapsed time per iteration (s): 5.45 | learning rate: 1.501E-05 | global batch size: 512 | lm loss: 4.919811E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.955 | TFLOPs: 70.96 | +[default7]: iteration 2940/ 3814 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 5.46 | learning rate: 1.498E-05 | global batch size: 512 | lm loss: 4.917050E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.806 | TFLOPs: 70.85 | +[default7]: iteration 2941/ 3814 | consumed samples: 1505792 | consumed tokens: 3083862016 | elapsed time per iteration (s): 5.44 | learning rate: 1.495E-05 | global batch size: 512 | lm loss: 4.914556E+00 | loss scale: 131072.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.082 | TFLOPs: 71.06 | +[default7]: iteration 2942/ 3814 | consumed samples: 1506304 | consumed tokens: 3084910592 | elapsed time per iteration (s): 5.45 | learning rate: 1.492E-05 | global batch size: 512 | lm loss: 4.909769E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.998 | TFLOPs: 71.00 | +[default7]: iteration 2943/ 3814 | consumed samples: 1506816 | consumed tokens: 3085959168 | elapsed time per iteration (s): 5.45 | learning rate: 1.488E-05 | global batch size: 512 | lm loss: 4.924101E+00 | loss scale: 131072.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.010 | TFLOPs: 71.01 | +[default7]: iteration 2944/ 3814 | consumed samples: 1507328 | consumed tokens: 3087007744 | elapsed time per iteration (s): 5.45 | learning rate: 1.485E-05 | global batch size: 512 | lm loss: 4.895560E+00 | loss scale: 131072.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.009 | TFLOPs: 71.01 | +[default7]: iteration 2945/ 3814 | consumed samples: 1507840 | consumed tokens: 3088056320 | elapsed time per iteration (s): 5.44 | learning rate: 1.482E-05 | global batch size: 512 | lm loss: 4.909452E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.154 | TFLOPs: 71.12 | +[default7]: iteration 2946/ 3814 | consumed samples: 1508352 | consumed tokens: 3089104896 | elapsed time per iteration (s): 5.43 | learning rate: 1.479E-05 | global batch size: 512 | lm loss: 4.892438E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.301 | TFLOPs: 71.23 | +[default7]: iteration 2947/ 3814 | consumed samples: 1508864 | consumed tokens: 3090153472 | elapsed time per iteration (s): 5.42 | learning rate: 1.476E-05 | global batch size: 512 | lm loss: 4.921845E+00 | loss scale: 131072.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.418 | TFLOPs: 71.32 | +[default7]: iteration 2948/ 3814 | consumed samples: 1509376 | consumed tokens: 3091202048 | elapsed time per iteration (s): 5.46 | learning rate: 1.472E-05 | global batch size: 512 | lm loss: 4.943553E+00 | loss scale: 131072.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.754 | TFLOPs: 70.81 | +[default7]: iteration 2949/ 3814 | consumed samples: 1509888 | consumed tokens: 3092250624 | elapsed time per iteration (s): 5.43 | learning rate: 1.469E-05 | global batch size: 512 | lm loss: 4.905015E+00 | loss scale: 131072.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.223 | TFLOPs: 71.17 | +[default7]: iteration 2950/ 3814 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 5.44 | learning rate: 1.466E-05 | global batch size: 512 | lm loss: 4.920453E+00 | loss scale: 131072.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.172 | TFLOPs: 71.13 | +[default7]: iteration 2951/ 3814 | consumed samples: 1510912 | consumed tokens: 3094347776 | elapsed time per iteration (s): 5.44 | learning rate: 1.463E-05 | global batch size: 512 | lm loss: 4.920248E+00 | loss scale: 131072.0 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.134 | TFLOPs: 71.10 | +[default7]: iteration 2952/ 3814 | consumed samples: 1511424 | consumed tokens: 3095396352 | elapsed time per iteration (s): 5.44 | learning rate: 1.459E-05 | global batch size: 512 | lm loss: 4.920037E+00 | loss scale: 131072.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.038 | TFLOPs: 71.03 | +[default7]: iteration 2953/ 3814 | consumed samples: 1511936 | consumed tokens: 3096444928 | elapsed time per iteration (s): 5.44 | learning rate: 1.456E-05 | global batch size: 512 | lm loss: 4.966977E+00 | loss scale: 131072.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.101 | TFLOPs: 71.08 | +[default7]: iteration 2954/ 3814 | consumed samples: 1512448 | consumed tokens: 3097493504 | elapsed time per iteration (s): 5.42 | learning rate: 1.453E-05 | global batch size: 512 | lm loss: 4.900006E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.465 | TFLOPs: 71.35 | +[default7]: iteration 2955/ 3814 | consumed samples: 1512960 | consumed tokens: 3098542080 | elapsed time per iteration (s): 5.46 | learning rate: 1.450E-05 | global batch size: 512 | lm loss: 4.925599E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.824 | TFLOPs: 70.87 | +[default7]: iteration 2956/ 3814 | consumed samples: 1513472 | consumed tokens: 3099590656 | elapsed time per iteration (s): 5.43 | learning rate: 1.447E-05 | global batch size: 512 | lm loss: 4.909141E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.286 | TFLOPs: 71.21 | +[default7]: iteration 2957/ 3814 | consumed samples: 1513984 | consumed tokens: 3100639232 | elapsed time per iteration (s): 5.45 | learning rate: 1.443E-05 | global batch size: 512 | lm loss: 4.900474E+00 | loss scale: 131072.0 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.983 | TFLOPs: 70.99 | +[default7]: iteration 2958/ 3814 | consumed samples: 1514496 | consumed tokens: 3101687808 | elapsed time per iteration (s): 5.45 | learning rate: 1.440E-05 | global batch size: 512 | lm loss: 4.919652E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.974 | TFLOPs: 70.98 | +[default7]: iteration 2959/ 3814 | consumed samples: 1515008 | consumed tokens: 3102736384 | elapsed time per iteration (s): 5.43 | learning rate: 1.437E-05 | global batch size: 512 | lm loss: 4.928222E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.346 | TFLOPs: 71.26 | +[default7]: iteration 2960/ 3814 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 5.43 | learning rate: 1.434E-05 | global batch size: 512 | lm loss: 4.927141E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.257 | TFLOPs: 71.19 | +[default7]: iteration 2961/ 3814 | consumed samples: 1516032 | consumed tokens: 3104833536 | elapsed time per iteration (s): 5.43 | learning rate: 1.431E-05 | global batch size: 512 | lm loss: 4.895174E+00 | loss scale: 131072.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.321 | TFLOPs: 71.24 | +[default7]: iteration 2962/ 3814 | consumed samples: 1516544 | consumed tokens: 3105882112 | elapsed time per iteration (s): 5.44 | learning rate: 1.427E-05 | global batch size: 512 | lm loss: 4.904191E+00 | loss scale: 131072.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.180 | TFLOPs: 71.14 | +[default7]: iteration 2963/ 3814 | consumed samples: 1517056 | consumed tokens: 3106930688 | elapsed time per iteration (s): 5.45 | learning rate: 1.424E-05 | global batch size: 512 | lm loss: 4.906175E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.970 | TFLOPs: 70.98 | +[default7]: iteration 2964/ 3814 | consumed samples: 1517568 | consumed tokens: 3107979264 | elapsed time per iteration (s): 5.46 | learning rate: 1.421E-05 | global batch size: 512 | lm loss: 4.928003E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.734 | TFLOPs: 70.80 | +[default7]: iteration 2965/ 3814 | consumed samples: 1518080 | consumed tokens: 3109027840 | elapsed time per iteration (s): 5.45 | learning rate: 1.418E-05 | global batch size: 512 | lm loss: 4.895422E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.961 | TFLOPs: 70.97 | +[default7]: iteration 2966/ 3814 | consumed samples: 1518592 | consumed tokens: 3110076416 | elapsed time per iteration (s): 5.46 | learning rate: 1.415E-05 | global batch size: 512 | lm loss: 4.943152E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.837 | TFLOPs: 70.88 | +[default7]: iteration 2967/ 3814 | consumed samples: 1519104 | consumed tokens: 3111124992 | elapsed time per iteration (s): 5.46 | learning rate: 1.412E-05 | global batch size: 512 | lm loss: 4.921082E+00 | loss scale: 131072.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.706 | TFLOPs: 70.78 | +[default7]: iteration 2968/ 3814 | consumed samples: 1519616 | consumed tokens: 3112173568 | elapsed time per iteration (s): 5.46 | learning rate: 1.408E-05 | global batch size: 512 | lm loss: 4.926886E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.855 | TFLOPs: 70.89 | +[default7]: iteration 2969/ 3814 | consumed samples: 1520128 | consumed tokens: 3113222144 | elapsed time per iteration (s): 5.43 | learning rate: 1.405E-05 | global batch size: 512 | lm loss: 4.929275E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.205 | TFLOPs: 71.15 | +[default7]: iteration 2970/ 3814 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 5.44 | learning rate: 1.402E-05 | global batch size: 512 | lm loss: 4.898437E+00 | loss scale: 131072.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.159 | TFLOPs: 71.12 | +[default7]: iteration 2971/ 3814 | consumed samples: 1521152 | consumed tokens: 3115319296 | elapsed time per iteration (s): 5.41 | learning rate: 1.399E-05 | global batch size: 512 | lm loss: 4.896992E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.674 | TFLOPs: 71.51 | +[default7]: iteration 2972/ 3814 | consumed samples: 1521664 | consumed tokens: 3116367872 | elapsed time per iteration (s): 5.46 | learning rate: 1.396E-05 | global batch size: 512 | lm loss: 4.899395E+00 | loss scale: 131072.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.693 | TFLOPs: 70.77 | +[default7]: iteration 2973/ 3814 | consumed samples: 1522176 | consumed tokens: 3117416448 | elapsed time per iteration (s): 5.44 | learning rate: 1.393E-05 | global batch size: 512 | lm loss: 4.909495E+00 | loss scale: 131072.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.075 | TFLOPs: 71.06 | +[default7]: iteration 2974/ 3814 | consumed samples: 1522688 | consumed tokens: 3118465024 | elapsed time per iteration (s): 5.45 | learning rate: 1.390E-05 | global batch size: 512 | lm loss: 4.931390E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.014 | TFLOPs: 71.01 | +[default7]: iteration 2975/ 3814 | consumed samples: 1523200 | consumed tokens: 3119513600 | elapsed time per iteration (s): 5.43 | learning rate: 1.386E-05 | global batch size: 512 | lm loss: 4.915209E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.246 | TFLOPs: 71.18 | +[default7]: iteration 2976/ 3814 | consumed samples: 1523712 | consumed tokens: 3120562176 | elapsed time per iteration (s): 5.44 | learning rate: 1.383E-05 | global batch size: 512 | lm loss: 4.918169E+00 | loss scale: 131072.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.150 | TFLOPs: 71.11 | +[default7]: iteration 2977/ 3814 | consumed samples: 1524224 | consumed tokens: 3121610752 | elapsed time per iteration (s): 5.43 | learning rate: 1.380E-05 | global batch size: 512 | lm loss: 4.887976E+00 | loss scale: 131072.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.348 | TFLOPs: 71.26 | +[default7]: iteration 2978/ 3814 | consumed samples: 1524736 | consumed tokens: 3122659328 | elapsed time per iteration (s): 5.42 | learning rate: 1.377E-05 | global batch size: 512 | lm loss: 4.897407E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.494 | TFLOPs: 71.37 | +[default7]: iteration 2979/ 3814 | consumed samples: 1525248 | consumed tokens: 3123707904 | elapsed time per iteration (s): 5.44 | learning rate: 1.374E-05 | global batch size: 512 | lm loss: 4.949199E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.045 | TFLOPs: 71.03 | +[default7]: iteration 2980/ 3814 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 5.43 | learning rate: 1.371E-05 | global batch size: 512 | lm loss: 4.912755E+00 | loss scale: 131072.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.281 | TFLOPs: 71.21 | +[default7]: iteration 2981/ 3814 | consumed samples: 1526272 | consumed tokens: 3125805056 | elapsed time per iteration (s): 5.40 | learning rate: 1.368E-05 | global batch size: 512 | lm loss: 4.905756E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.791 | TFLOPs: 71.60 | +[default7]: iteration 2982/ 3814 | consumed samples: 1526784 | consumed tokens: 3126853632 | elapsed time per iteration (s): 5.42 | learning rate: 1.365E-05 | global batch size: 512 | lm loss: 4.899488E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.442 | TFLOPs: 71.33 | +[default7]: iteration 2983/ 3814 | consumed samples: 1527296 | consumed tokens: 3127902208 | elapsed time per iteration (s): 5.42 | learning rate: 1.361E-05 | global batch size: 512 | lm loss: 4.936416E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.503 | TFLOPs: 71.38 | +[default7]: iteration 2984/ 3814 | consumed samples: 1527808 | consumed tokens: 3128950784 | elapsed time per iteration (s): 5.42 | learning rate: 1.358E-05 | global batch size: 512 | lm loss: 4.908954E+00 | loss scale: 131072.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.408 | TFLOPs: 71.31 | +[default7]: iteration 2985/ 3814 | consumed samples: 1528320 | consumed tokens: 3129999360 | elapsed time per iteration (s): 5.42 | learning rate: 1.355E-05 | global batch size: 512 | lm loss: 4.925193E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.521 | TFLOPs: 71.39 | +[default7]: iteration 2986/ 3814 | consumed samples: 1528832 | consumed tokens: 3131047936 | elapsed time per iteration (s): 5.44 | learning rate: 1.352E-05 | global batch size: 512 | lm loss: 4.895463E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.143 | TFLOPs: 71.11 | +[default7]: iteration 2987/ 3814 | consumed samples: 1529344 | consumed tokens: 3132096512 | elapsed time per iteration (s): 5.44 | learning rate: 1.349E-05 | global batch size: 512 | lm loss: 4.904925E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.183 | TFLOPs: 71.14 | +[default7]: iteration 2988/ 3814 | consumed samples: 1529856 | consumed tokens: 3133145088 | elapsed time per iteration (s): 5.45 | learning rate: 1.346E-05 | global batch size: 512 | lm loss: 4.883828E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.022 | TFLOPs: 71.02 | +[default7]: iteration 2989/ 3814 | consumed samples: 1530368 | consumed tokens: 3134193664 | elapsed time per iteration (s): 5.44 | learning rate: 1.343E-05 | global batch size: 512 | lm loss: 4.925210E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.129 | TFLOPs: 71.10 | +[default7]: iteration 2990/ 3814 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 5.44 | learning rate: 1.340E-05 | global batch size: 512 | lm loss: 4.935115E+00 | loss scale: 131072.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.062 | TFLOPs: 71.05 | +[default7]: iteration 2991/ 3814 | consumed samples: 1531392 | consumed tokens: 3136290816 | elapsed time per iteration (s): 5.44 | learning rate: 1.337E-05 | global batch size: 512 | lm loss: 4.897800E+00 | loss scale: 131072.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.073 | TFLOPs: 71.05 | +[default7]: iteration 2992/ 3814 | consumed samples: 1531904 | consumed tokens: 3137339392 | elapsed time per iteration (s): 5.45 | learning rate: 1.333E-05 | global batch size: 512 | lm loss: 4.927420E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.872 | TFLOPs: 70.90 | +[default7]: iteration 2993/ 3814 | consumed samples: 1532416 | consumed tokens: 3138387968 | elapsed time per iteration (s): 5.45 | learning rate: 1.330E-05 | global batch size: 512 | lm loss: 4.913295E+00 | loss scale: 131072.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.937 | TFLOPs: 70.95 | +[default7]: iteration 2994/ 3814 | consumed samples: 1532928 | consumed tokens: 3139436544 | elapsed time per iteration (s): 5.46 | learning rate: 1.327E-05 | global batch size: 512 | lm loss: 4.930776E+00 | loss scale: 131072.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.844 | TFLOPs: 70.88 | +[default7]: iteration 2995/ 3814 | consumed samples: 1533440 | consumed tokens: 3140485120 | elapsed time per iteration (s): 5.44 | learning rate: 1.324E-05 | global batch size: 512 | lm loss: 4.898215E+00 | loss scale: 131072.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.067 | TFLOPs: 71.05 | +[default7]: iteration 2996/ 3814 | consumed samples: 1533952 | consumed tokens: 3141533696 | elapsed time per iteration (s): 5.44 | learning rate: 1.321E-05 | global batch size: 512 | lm loss: 4.930140E+00 | loss scale: 131072.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.144 | TFLOPs: 71.11 | +[default7]: iteration 2997/ 3814 | consumed samples: 1534464 | consumed tokens: 3142582272 | elapsed time per iteration (s): 5.46 | learning rate: 1.318E-05 | global batch size: 512 | lm loss: 4.883051E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.831 | TFLOPs: 70.87 | +[default7]: iteration 2998/ 3814 | consumed samples: 1534976 | consumed tokens: 3143630848 | elapsed time per iteration (s): 5.47 | learning rate: 1.315E-05 | global batch size: 512 | lm loss: 4.910034E+00 | loss scale: 131072.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.604 | TFLOPs: 70.70 | +[default7]: iteration 2999/ 3814 | consumed samples: 1535488 | consumed tokens: 3144679424 | elapsed time per iteration (s): 5.45 | learning rate: 1.312E-05 | global batch size: 512 | lm loss: 4.911672E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.875 | TFLOPs: 70.90 | +[default7]: iteration 3000/ 3814 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 5.47 | learning rate: 1.309E-05 | global batch size: 512 | lm loss: 4.905451E+00 | loss scale: 262144.0 | grad norm: 0.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.648 | TFLOPs: 70.73 | +[default7]:------------------------------------------------------------------------------------------ +[default7]:valid loss at iteration 3000 | lm loss value: 4.858368E+00 | lm loss PPL: 1.288139E+02 | +[default7]:------------------------------------------------------------------------------------------ +[default1]:[2023-02-16 17:05:16,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_01-model_01-model_states.pt... +[default0]:saving checkpoint at iteration 3000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-16 17:05:16,869] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +[default0]:[2023-02-16 17:05:16,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_01-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_01-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,057] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_04-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_01-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_04-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:17,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_04-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_05-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_04-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_05-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:17,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_05-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,186] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_06-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_05-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_06-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_06-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_07-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_06-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_07-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:17,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_07-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_08-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_07-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_08-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:17,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_08-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_09-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_08-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,373] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_09-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_09-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_10-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:17,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_10-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_11-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_09-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,440] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_10-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:17,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_10-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_11-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_11-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_12-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_11-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_12-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_12-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_13-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:17,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_13-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_14-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_12-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,631] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_13-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:17,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_13-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_14-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_14-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_15-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_14-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_15-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:17,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_15-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_16-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_15-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_16-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:17,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_16-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_17-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_16-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_17-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:17,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_17-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:17,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_18-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:18,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_18-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_19-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:17,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_17-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:17,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_18-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:18,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_18-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_19-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_19-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_20-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_19-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_20-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_20-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_21-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:18,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_21-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_22-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_20-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_21-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:18,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_21-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_22-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_22-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_23-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_22-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,262] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_23-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:18,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_23-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_24-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_23-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_24-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:18,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_24-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_25-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_24-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_25-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_25-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,459] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_26-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_25-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_26-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:18,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_26-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_27-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_26-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_27-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:18,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_27-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_28-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_27-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_28-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_28-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_29-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:18,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_29-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,710] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_30-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_28-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_29-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:18,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_29-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_30-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_30-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_31-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_30-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,776] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_31-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_31-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_32-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:18,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_32-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,903] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_33-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_31-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_32-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_32-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_33-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:18,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_33-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:18,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_34-model_01-model_states.pt... +[default0]:[2023-02-16 17:05:18,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_33-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:18,973] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_34-model_00-model_states.pt... +[default1]:[2023-02-16 17:05:19,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_34-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:19,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_35-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:19,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_35-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:19,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_37-model_01-model_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_16_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_14_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_12_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_15_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_29_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_19_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 17:05:19,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_37-model_01-model_states.pt. +[default1]:[2023-02-16 17:05:19,097] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_01_model_states.pt +[default1]:[2023-02-16 17:05:19,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_01_model_states.pt... +[default1]:[2023-02-16 17:05:19,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_01_model_states.pt. +[default0]:[2023-02-16 17:05:19,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_34-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:19,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_35-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:19,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_35-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:19,100] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_37-model_00-model_states.pt... +[default0]:[2023-02-16 17:05:19,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_37-model_00-model_states.pt. +[default0]:[2023-02-16 17:05:19,102] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_00_model_states.pt +[default0]:[2023-02-16 17:05:19,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_00_model_states.pt... +[default0]:[2023-02-16 17:05:19,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_00_model_states.pt. +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_23_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_20_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_31_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_25_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_28_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_21_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_26_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_8_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_22_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_30_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_18_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_27_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_9_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_17_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_13_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_11_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_10_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_24_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:05:19,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 17:05:19,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_11_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,586] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_11_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,675] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_13_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,680] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_13_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2023-02-16 17:05:19,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_10_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,638] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_10_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,759] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2023-02-16 17:05:19,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,697] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,694] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,780] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,760] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_8_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,794] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_8_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2023-02-16 17:05:19,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_22_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,790] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_22_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,709] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_9_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,799] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_9_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,773] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,824] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,758] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,825] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_16_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,811] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_16_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_29_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,809] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_29_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2023-02-16 17:05:19,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_19_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,850] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_19_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,825] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2023-02-16 17:05:19,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,827] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2023-02-16 17:05:19,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_23_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,785] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_23_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,818] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_20_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,837] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_20_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,866] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,812] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,864] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,806] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2023-02-16 17:05:19,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_31_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,816] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_31_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,855] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,877] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,858] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,818] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_28_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,812] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_28_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_21_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,863] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_21_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,822] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2023-02-16 17:05:19,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,853] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2023-02-16 17:05:19,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_26_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,887] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_26_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,893] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2023-02-16 17:05:19,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_30_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,851] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_30_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2023-02-16 17:05:19,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_18_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,811] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_18_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_17_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,873] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_17_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,871] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,844] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,909] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,844] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,879] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2023-02-16 17:05:19,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,882] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,925] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,905] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-16 17:05:19,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:05:19,914] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:05:19,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_24_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,887] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_24_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,839] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,840] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2023-02-16 17:05:19,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_12_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:05:19,883] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_12_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:05:19,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,889] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-16 17:05:19,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_25_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:05:19,885] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_25_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:05:19,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2023-02-16 17:05:19,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:05:19,978] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:05:19,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-16 17:05:19,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:05:19,967] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:05:19,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2023-02-16 17:05:19,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_27_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,911] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_27_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-16 17:05:19,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:05:19,946] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:05:19,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]: successfully saved checkpoint at iteration 3000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default5]:[2023-02-16 17:05:19,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_14_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:05:19,962] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_14_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:05:19,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2023-02-16 17:05:19,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_15_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:05:19,986] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_15_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:05:19,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:time (ms) | save-checkpoint: 3118.50 +[default7]: iteration 3001/ 3814 | consumed samples: 1536512 | consumed tokens: 3146776576 | elapsed time per iteration (s): 9.61 | learning rate: 1.306E-05 | global batch size: 512 | lm loss: 4.906946E+00 | loss scale: 262144.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 53.267 | TFLOPs: 40.23 | +[default7]: iteration 3002/ 3814 | consumed samples: 1537024 | consumed tokens: 3147825152 | elapsed time per iteration (s): 5.47 | learning rate: 1.303E-05 | global batch size: 512 | lm loss: 4.906171E+00 | loss scale: 262144.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.680 | TFLOPs: 70.76 | +[default7]: iteration 3003/ 3814 | consumed samples: 1537536 | consumed tokens: 3148873728 | elapsed time per iteration (s): 5.46 | learning rate: 1.300E-05 | global batch size: 512 | lm loss: 4.918630E+00 | loss scale: 262144.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.824 | TFLOPs: 70.87 | +[default7]: iteration 3004/ 3814 | consumed samples: 1538048 | consumed tokens: 3149922304 | elapsed time per iteration (s): 5.45 | learning rate: 1.297E-05 | global batch size: 512 | lm loss: 4.908820E+00 | loss scale: 262144.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.889 | TFLOPs: 70.92 | +[default7]: iteration 3005/ 3814 | consumed samples: 1538560 | consumed tokens: 3150970880 | elapsed time per iteration (s): 5.46 | learning rate: 1.294E-05 | global batch size: 512 | lm loss: 4.899635E+00 | loss scale: 262144.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.799 | TFLOPs: 70.85 | +[default7]: iteration 3006/ 3814 | consumed samples: 1539072 | consumed tokens: 3152019456 | elapsed time per iteration (s): 5.43 | learning rate: 1.291E-05 | global batch size: 512 | lm loss: 4.915149E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.332 | TFLOPs: 71.25 | +[default7]: iteration 3007/ 3814 | consumed samples: 1539584 | consumed tokens: 3153068032 | elapsed time per iteration (s): 5.48 | learning rate: 1.287E-05 | global batch size: 512 | lm loss: 4.921273E+00 | loss scale: 262144.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.347 | TFLOPs: 70.51 | +[default7]: iteration 3008/ 3814 | consumed samples: 1540096 | consumed tokens: 3154116608 | elapsed time per iteration (s): 5.49 | learning rate: 1.284E-05 | global batch size: 512 | lm loss: 4.909356E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.285 | TFLOPs: 70.46 | +[default7]: iteration 3009/ 3814 | consumed samples: 1540608 | consumed tokens: 3155165184 | elapsed time per iteration (s): 5.51 | learning rate: 1.281E-05 | global batch size: 512 | lm loss: 4.924896E+00 | loss scale: 262144.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.935 | TFLOPs: 70.19 | +[default7]: iteration 3010/ 3814 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 5.47 | learning rate: 1.278E-05 | global batch size: 512 | lm loss: 4.909941E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.678 | TFLOPs: 70.76 | +[default7]: iteration 3011/ 3814 | consumed samples: 1541632 | consumed tokens: 3157262336 | elapsed time per iteration (s): 5.48 | learning rate: 1.275E-05 | global batch size: 512 | lm loss: 4.905879E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.450 | TFLOPs: 70.58 | +[default7]: iteration 3012/ 3814 | consumed samples: 1542144 | consumed tokens: 3158310912 | elapsed time per iteration (s): 5.47 | learning rate: 1.272E-05 | global batch size: 512 | lm loss: 4.928553E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.656 | TFLOPs: 70.74 | +[default7]: iteration 3013/ 3814 | consumed samples: 1542656 | consumed tokens: 3159359488 | elapsed time per iteration (s): 5.45 | learning rate: 1.269E-05 | global batch size: 512 | lm loss: 4.927531E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.915 | TFLOPs: 70.94 | +[default7]: iteration 3014/ 3814 | consumed samples: 1543168 | consumed tokens: 3160408064 | elapsed time per iteration (s): 5.46 | learning rate: 1.266E-05 | global batch size: 512 | lm loss: 4.916295E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.694 | TFLOPs: 70.77 | +[default7]: iteration 3015/ 3814 | consumed samples: 1543680 | consumed tokens: 3161456640 | elapsed time per iteration (s): 5.42 | learning rate: 1.263E-05 | global batch size: 512 | lm loss: 4.921210E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.387 | TFLOPs: 71.29 | +[default7]: iteration 3016/ 3814 | consumed samples: 1544192 | consumed tokens: 3162505216 | elapsed time per iteration (s): 5.46 | learning rate: 1.260E-05 | global batch size: 512 | lm loss: 4.881182E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.832 | TFLOPs: 70.87 | +[default7]: iteration 3017/ 3814 | consumed samples: 1544704 | consumed tokens: 3163553792 | elapsed time per iteration (s): 5.46 | learning rate: 1.257E-05 | global batch size: 512 | lm loss: 4.892162E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.751 | TFLOPs: 70.81 | +[default7]: iteration 3018/ 3814 | consumed samples: 1545216 | consumed tokens: 3164602368 | elapsed time per iteration (s): 5.48 | learning rate: 1.254E-05 | global batch size: 512 | lm loss: 4.875088E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.442 | TFLOPs: 70.58 | +[default7]: iteration 3019/ 3814 | consumed samples: 1545728 | consumed tokens: 3165650944 | elapsed time per iteration (s): 5.45 | learning rate: 1.251E-05 | global batch size: 512 | lm loss: 4.892815E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.005 | TFLOPs: 71.00 | +[default7]: iteration 3020/ 3814 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 5.45 | learning rate: 1.248E-05 | global batch size: 512 | lm loss: 4.907596E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.982 | TFLOPs: 70.99 | +[default7]: iteration 3021/ 3814 | consumed samples: 1546752 | consumed tokens: 3167748096 | elapsed time per iteration (s): 5.45 | learning rate: 1.245E-05 | global batch size: 512 | lm loss: 4.915272E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.966 | TFLOPs: 70.97 | +[default7]: iteration 3022/ 3814 | consumed samples: 1547264 | consumed tokens: 3168796672 | elapsed time per iteration (s): 5.43 | learning rate: 1.242E-05 | global batch size: 512 | lm loss: 4.911263E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.255 | TFLOPs: 71.19 | +[default7]: iteration 3023/ 3814 | consumed samples: 1547776 | consumed tokens: 3169845248 | elapsed time per iteration (s): 5.42 | learning rate: 1.239E-05 | global batch size: 512 | lm loss: 4.910518E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.456 | TFLOPs: 71.34 | +[default7]: iteration 3024/ 3814 | consumed samples: 1548288 | consumed tokens: 3170893824 | elapsed time per iteration (s): 5.43 | learning rate: 1.236E-05 | global batch size: 512 | lm loss: 4.930357E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.270 | TFLOPs: 71.20 | +[default7]: iteration 3025/ 3814 | consumed samples: 1548800 | consumed tokens: 3171942400 | elapsed time per iteration (s): 5.43 | learning rate: 1.233E-05 | global batch size: 512 | lm loss: 4.900331E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.357 | TFLOPs: 71.27 | +[default7]: iteration 3026/ 3814 | consumed samples: 1549312 | consumed tokens: 3172990976 | elapsed time per iteration (s): 5.45 | learning rate: 1.230E-05 | global batch size: 512 | lm loss: 4.926271E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.871 | TFLOPs: 70.90 | +[default7]: iteration 3027/ 3814 | consumed samples: 1549824 | consumed tokens: 3174039552 | elapsed time per iteration (s): 5.44 | learning rate: 1.227E-05 | global batch size: 512 | lm loss: 4.914916E+00 | loss scale: 262144.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.110 | TFLOPs: 71.08 | +[default7]: iteration 3028/ 3814 | consumed samples: 1550336 | consumed tokens: 3175088128 | elapsed time per iteration (s): 5.42 | learning rate: 1.224E-05 | global batch size: 512 | lm loss: 4.925916E+00 | loss scale: 262144.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.418 | TFLOPs: 71.31 | +[default7]: iteration 3029/ 3814 | consumed samples: 1550848 | consumed tokens: 3176136704 | elapsed time per iteration (s): 5.45 | learning rate: 1.221E-05 | global batch size: 512 | lm loss: 4.896079E+00 | loss scale: 262144.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.917 | TFLOPs: 70.94 | +[default7]: iteration 3030/ 3814 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 5.45 | learning rate: 1.218E-05 | global batch size: 512 | lm loss: 4.900238E+00 | loss scale: 262144.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.902 | TFLOPs: 70.93 | +[default7]: iteration 3031/ 3814 | consumed samples: 1551872 | consumed tokens: 3178233856 | elapsed time per iteration (s): 5.44 | learning rate: 1.215E-05 | global batch size: 512 | lm loss: 4.914775E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.116 | TFLOPs: 71.09 | +[default7]: iteration 3032/ 3814 | consumed samples: 1552384 | consumed tokens: 3179282432 | elapsed time per iteration (s): 5.43 | learning rate: 1.212E-05 | global batch size: 512 | lm loss: 4.885135E+00 | loss scale: 262144.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.213 | TFLOPs: 71.16 | +[default7]: iteration 3033/ 3814 | consumed samples: 1552896 | consumed tokens: 3180331008 | elapsed time per iteration (s): 5.45 | learning rate: 1.209E-05 | global batch size: 512 | lm loss: 4.949057E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.000 | TFLOPs: 71.00 | +[default7]: iteration 3034/ 3814 | consumed samples: 1553408 | consumed tokens: 3181379584 | elapsed time per iteration (s): 5.43 | learning rate: 1.206E-05 | global batch size: 512 | lm loss: 4.882306E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.248 | TFLOPs: 71.19 | +[default7]: iteration 3035/ 3814 | consumed samples: 1553920 | consumed tokens: 3182428160 | elapsed time per iteration (s): 5.43 | learning rate: 1.203E-05 | global batch size: 512 | lm loss: 4.897512E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.257 | TFLOPs: 71.19 | +[default7]: iteration 3036/ 3814 | consumed samples: 1554432 | consumed tokens: 3183476736 | elapsed time per iteration (s): 5.44 | learning rate: 1.200E-05 | global batch size: 512 | lm loss: 4.898931E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.123 | TFLOPs: 71.09 | +[default7]: iteration 3037/ 3814 | consumed samples: 1554944 | consumed tokens: 3184525312 | elapsed time per iteration (s): 5.45 | learning rate: 1.198E-05 | global batch size: 512 | lm loss: 4.891583E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.988 | TFLOPs: 70.99 | +[default7]: iteration 3038/ 3814 | consumed samples: 1555456 | consumed tokens: 3185573888 | elapsed time per iteration (s): 5.45 | learning rate: 1.195E-05 | global batch size: 512 | lm loss: 4.889398E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.939 | TFLOPs: 70.95 | +[default7]: iteration 3039/ 3814 | consumed samples: 1555968 | consumed tokens: 3186622464 | elapsed time per iteration (s): 5.45 | learning rate: 1.192E-05 | global batch size: 512 | lm loss: 4.908161E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.025 | TFLOPs: 71.02 | +[default7]: iteration 3040/ 3814 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 5.44 | learning rate: 1.189E-05 | global batch size: 512 | lm loss: 4.898180E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.180 | TFLOPs: 71.14 | +[default7]: iteration 3041/ 3814 | consumed samples: 1556992 | consumed tokens: 3188719616 | elapsed time per iteration (s): 5.45 | learning rate: 1.186E-05 | global batch size: 512 | lm loss: 4.919466E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.019 | TFLOPs: 71.01 | +[default7]: iteration 3042/ 3814 | consumed samples: 1557504 | consumed tokens: 3189768192 | elapsed time per iteration (s): 5.48 | learning rate: 1.183E-05 | global batch size: 512 | lm loss: 4.878886E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.465 | TFLOPs: 70.60 | +[default7]: iteration 3043/ 3814 | consumed samples: 1558016 | consumed tokens: 3190816768 | elapsed time per iteration (s): 5.45 | learning rate: 1.180E-05 | global batch size: 512 | lm loss: 4.904360E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.882 | TFLOPs: 70.91 | +[default7]: iteration 3044/ 3814 | consumed samples: 1558528 | consumed tokens: 3191865344 | elapsed time per iteration (s): 5.45 | learning rate: 1.177E-05 | global batch size: 512 | lm loss: 4.878235E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.880 | TFLOPs: 70.91 | +[default7]: iteration 3045/ 3814 | consumed samples: 1559040 | consumed tokens: 3192913920 | elapsed time per iteration (s): 5.45 | learning rate: 1.174E-05 | global batch size: 512 | lm loss: 4.914328E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.012 | TFLOPs: 71.01 | +[default7]: iteration 3046/ 3814 | consumed samples: 1559552 | consumed tokens: 3193962496 | elapsed time per iteration (s): 5.45 | learning rate: 1.171E-05 | global batch size: 512 | lm loss: 4.913244E+00 | loss scale: 262144.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.972 | TFLOPs: 70.98 | +[default7]: iteration 3047/ 3814 | consumed samples: 1560064 | consumed tokens: 3195011072 | elapsed time per iteration (s): 5.50 | learning rate: 1.168E-05 | global batch size: 512 | lm loss: 4.912503E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.126 | TFLOPs: 70.34 | +[default7]: iteration 3048/ 3814 | consumed samples: 1560576 | consumed tokens: 3196059648 | elapsed time per iteration (s): 5.48 | learning rate: 1.165E-05 | global batch size: 512 | lm loss: 4.901104E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.484 | TFLOPs: 70.61 | +[default7]: iteration 3049/ 3814 | consumed samples: 1561088 | consumed tokens: 3197108224 | elapsed time per iteration (s): 5.46 | learning rate: 1.162E-05 | global batch size: 512 | lm loss: 4.906130E+00 | loss scale: 262144.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.709 | TFLOPs: 70.78 | +[default7]: iteration 3050/ 3814 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 5.45 | learning rate: 1.159E-05 | global batch size: 512 | lm loss: 4.883965E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.957 | TFLOPs: 70.97 | +[default7]: iteration 3051/ 3814 | consumed samples: 1562112 | consumed tokens: 3199205376 | elapsed time per iteration (s): 5.45 | learning rate: 1.157E-05 | global batch size: 512 | lm loss: 4.915517E+00 | loss scale: 262144.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.018 | TFLOPs: 71.01 | +[default7]: iteration 3052/ 3814 | consumed samples: 1562624 | consumed tokens: 3200253952 | elapsed time per iteration (s): 5.47 | learning rate: 1.154E-05 | global batch size: 512 | lm loss: 4.915502E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.620 | TFLOPs: 70.71 | +[default7]: iteration 3053/ 3814 | consumed samples: 1563136 | consumed tokens: 3201302528 | elapsed time per iteration (s): 5.46 | learning rate: 1.151E-05 | global batch size: 512 | lm loss: 4.917213E+00 | loss scale: 262144.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.845 | TFLOPs: 70.88 | +[default7]: iteration 3054/ 3814 | consumed samples: 1563648 | consumed tokens: 3202351104 | elapsed time per iteration (s): 5.45 | learning rate: 1.148E-05 | global batch size: 512 | lm loss: 4.904161E+00 | loss scale: 262144.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.930 | TFLOPs: 70.95 | +[default7]: iteration 3055/ 3814 | consumed samples: 1564160 | consumed tokens: 3203399680 | elapsed time per iteration (s): 5.45 | learning rate: 1.145E-05 | global batch size: 512 | lm loss: 4.881147E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.882 | TFLOPs: 70.91 | +[default7]: iteration 3056/ 3814 | consumed samples: 1564672 | consumed tokens: 3204448256 | elapsed time per iteration (s): 5.46 | learning rate: 1.142E-05 | global batch size: 512 | lm loss: 4.909692E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.713 | TFLOPs: 70.78 | +[default7]: iteration 3057/ 3814 | consumed samples: 1565184 | consumed tokens: 3205496832 | elapsed time per iteration (s): 5.45 | learning rate: 1.139E-05 | global batch size: 512 | lm loss: 4.890395E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.922 | TFLOPs: 70.94 | +[default7]: iteration 3058/ 3814 | consumed samples: 1565696 | consumed tokens: 3206545408 | elapsed time per iteration (s): 5.44 | learning rate: 1.136E-05 | global batch size: 512 | lm loss: 4.912347E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.073 | TFLOPs: 71.05 | +[default7]: iteration 3059/ 3814 | consumed samples: 1566208 | consumed tokens: 3207593984 | elapsed time per iteration (s): 5.44 | learning rate: 1.133E-05 | global batch size: 512 | lm loss: 4.913183E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.175 | TFLOPs: 71.13 | +[default7]: iteration 3060/ 3814 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 5.43 | learning rate: 1.131E-05 | global batch size: 512 | lm loss: 4.907928E+00 | loss scale: 262144.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.257 | TFLOPs: 71.19 | +[default7]: iteration 3061/ 3814 | consumed samples: 1567232 | consumed tokens: 3209691136 | elapsed time per iteration (s): 5.44 | learning rate: 1.128E-05 | global batch size: 512 | lm loss: 4.895776E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.119 | TFLOPs: 71.09 | +[default7]: iteration 3062/ 3814 | consumed samples: 1567744 | consumed tokens: 3210739712 | elapsed time per iteration (s): 5.44 | learning rate: 1.125E-05 | global batch size: 512 | lm loss: 4.881444E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.080 | TFLOPs: 71.06 | +[default7]: iteration 3063/ 3814 | consumed samples: 1568256 | consumed tokens: 3211788288 | elapsed time per iteration (s): 5.46 | learning rate: 1.122E-05 | global batch size: 512 | lm loss: 4.877427E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.770 | TFLOPs: 70.83 | +[default7]: iteration 3064/ 3814 | consumed samples: 1568768 | consumed tokens: 3212836864 | elapsed time per iteration (s): 5.46 | learning rate: 1.119E-05 | global batch size: 512 | lm loss: 4.910019E+00 | loss scale: 262144.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.710 | TFLOPs: 70.78 | +[default7]: iteration 3065/ 3814 | consumed samples: 1569280 | consumed tokens: 3213885440 | elapsed time per iteration (s): 5.49 | learning rate: 1.116E-05 | global batch size: 512 | lm loss: 4.883306E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.191 | TFLOPs: 70.39 | +[default7]: iteration 3066/ 3814 | consumed samples: 1569792 | consumed tokens: 3214934016 | elapsed time per iteration (s): 5.44 | learning rate: 1.113E-05 | global batch size: 512 | lm loss: 4.903257E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.094 | TFLOPs: 71.07 | +[default7]: iteration 3067/ 3814 | consumed samples: 1570304 | consumed tokens: 3215982592 | elapsed time per iteration (s): 5.45 | learning rate: 1.110E-05 | global batch size: 512 | lm loss: 4.902945E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.965 | TFLOPs: 70.97 | +[default7]: iteration 3068/ 3814 | consumed samples: 1570816 | consumed tokens: 3217031168 | elapsed time per iteration (s): 5.44 | learning rate: 1.108E-05 | global batch size: 512 | lm loss: 4.922894E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.048 | TFLOPs: 71.04 | +[default7]: iteration 3069/ 3814 | consumed samples: 1571328 | consumed tokens: 3218079744 | elapsed time per iteration (s): 5.45 | learning rate: 1.105E-05 | global batch size: 512 | lm loss: 4.890932E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.925 | TFLOPs: 70.94 | +[default7]: iteration 3070/ 3814 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 5.45 | learning rate: 1.102E-05 | global batch size: 512 | lm loss: 4.911865E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.976 | TFLOPs: 70.98 | +[default7]: iteration 3071/ 3814 | consumed samples: 1572352 | consumed tokens: 3220176896 | elapsed time per iteration (s): 5.44 | learning rate: 1.099E-05 | global batch size: 512 | lm loss: 4.907896E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.128 | TFLOPs: 71.10 | +[default7]: iteration 3072/ 3814 | consumed samples: 1572864 | consumed tokens: 3221225472 | elapsed time per iteration (s): 5.44 | learning rate: 1.096E-05 | global batch size: 512 | lm loss: 4.861389E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.088 | TFLOPs: 71.07 | +[default7]: iteration 3073/ 3814 | consumed samples: 1573376 | consumed tokens: 3222274048 | elapsed time per iteration (s): 5.45 | learning rate: 1.093E-05 | global batch size: 512 | lm loss: 4.894814E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.906 | TFLOPs: 70.93 | +[default7]: iteration 3074/ 3814 | consumed samples: 1573888 | consumed tokens: 3223322624 | elapsed time per iteration (s): 5.47 | learning rate: 1.091E-05 | global batch size: 512 | lm loss: 4.896415E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.532 | TFLOPs: 70.65 | +[default7]: iteration 3075/ 3814 | consumed samples: 1574400 | consumed tokens: 3224371200 | elapsed time per iteration (s): 5.46 | learning rate: 1.088E-05 | global batch size: 512 | lm loss: 4.884424E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.720 | TFLOPs: 70.79 | +[default7]: iteration 3076/ 3814 | consumed samples: 1574912 | consumed tokens: 3225419776 | elapsed time per iteration (s): 5.46 | learning rate: 1.085E-05 | global batch size: 512 | lm loss: 4.891894E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.694 | TFLOPs: 70.77 | +[default7]: iteration 3077/ 3814 | consumed samples: 1575424 | consumed tokens: 3226468352 | elapsed time per iteration (s): 5.49 | learning rate: 1.082E-05 | global batch size: 512 | lm loss: 4.904089E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.280 | TFLOPs: 70.46 | +[default7]: iteration 3078/ 3814 | consumed samples: 1575936 | consumed tokens: 3227516928 | elapsed time per iteration (s): 5.45 | learning rate: 1.079E-05 | global batch size: 512 | lm loss: 4.904779E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.995 | TFLOPs: 71.00 | +[default7]: iteration 3079/ 3814 | consumed samples: 1576448 | consumed tokens: 3228565504 | elapsed time per iteration (s): 5.46 | learning rate: 1.076E-05 | global batch size: 512 | lm loss: 4.888978E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.832 | TFLOPs: 70.87 | +[default7]: iteration 3080/ 3814 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 5.47 | learning rate: 1.074E-05 | global batch size: 512 | lm loss: 4.902528E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.607 | TFLOPs: 70.70 | +[default7]: iteration 3081/ 3814 | consumed samples: 1577472 | consumed tokens: 3230662656 | elapsed time per iteration (s): 5.47 | learning rate: 1.071E-05 | global batch size: 512 | lm loss: 4.892663E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.639 | TFLOPs: 70.73 | +[default7]: iteration 3082/ 3814 | consumed samples: 1577984 | consumed tokens: 3231711232 | elapsed time per iteration (s): 5.45 | learning rate: 1.068E-05 | global batch size: 512 | lm loss: 4.907976E+00 | loss scale: 262144.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.975 | TFLOPs: 70.98 | +[default7]: iteration 3083/ 3814 | consumed samples: 1578496 | consumed tokens: 3232759808 | elapsed time per iteration (s): 5.46 | learning rate: 1.065E-05 | global batch size: 512 | lm loss: 4.895464E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.831 | TFLOPs: 70.87 | +[default7]: iteration 3084/ 3814 | consumed samples: 1579008 | consumed tokens: 3233808384 | elapsed time per iteration (s): 5.45 | learning rate: 1.062E-05 | global batch size: 512 | lm loss: 4.889454E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.962 | TFLOPs: 70.97 | +[default7]: iteration 3085/ 3814 | consumed samples: 1579520 | consumed tokens: 3234856960 | elapsed time per iteration (s): 5.50 | learning rate: 1.060E-05 | global batch size: 512 | lm loss: 4.917595E+00 | loss scale: 262144.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.118 | TFLOPs: 70.33 | +[default7]: iteration 3086/ 3814 | consumed samples: 1580032 | consumed tokens: 3235905536 | elapsed time per iteration (s): 5.45 | learning rate: 1.057E-05 | global batch size: 512 | lm loss: 4.888105E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.005 | TFLOPs: 71.00 | +[default7]: iteration 3087/ 3814 | consumed samples: 1580544 | consumed tokens: 3236954112 | elapsed time per iteration (s): 5.45 | learning rate: 1.054E-05 | global batch size: 512 | lm loss: 4.909908E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.957 | TFLOPs: 70.97 | +[default7]: iteration 3088/ 3814 | consumed samples: 1581056 | consumed tokens: 3238002688 | elapsed time per iteration (s): 5.44 | learning rate: 1.051E-05 | global batch size: 512 | lm loss: 4.904099E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.099 | TFLOPs: 71.07 | +[default7]: iteration 3089/ 3814 | consumed samples: 1581568 | consumed tokens: 3239051264 | elapsed time per iteration (s): 5.45 | learning rate: 1.048E-05 | global batch size: 512 | lm loss: 4.896671E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.944 | TFLOPs: 70.96 | +[default7]: iteration 3090/ 3814 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 5.44 | learning rate: 1.046E-05 | global batch size: 512 | lm loss: 4.911400E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.139 | TFLOPs: 71.10 | +[default7]: iteration 3091/ 3814 | consumed samples: 1582592 | consumed tokens: 3241148416 | elapsed time per iteration (s): 5.43 | learning rate: 1.043E-05 | global batch size: 512 | lm loss: 4.876360E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.206 | TFLOPs: 71.15 | +[default7]: iteration 3092/ 3814 | consumed samples: 1583104 | consumed tokens: 3242196992 | elapsed time per iteration (s): 5.43 | learning rate: 1.040E-05 | global batch size: 512 | lm loss: 4.895341E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.369 | TFLOPs: 71.28 | +[default7]: iteration 3093/ 3814 | consumed samples: 1583616 | consumed tokens: 3243245568 | elapsed time per iteration (s): 5.42 | learning rate: 1.037E-05 | global batch size: 512 | lm loss: 4.892171E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.382 | TFLOPs: 71.29 | +[default7]: iteration 3094/ 3814 | consumed samples: 1584128 | consumed tokens: 3244294144 | elapsed time per iteration (s): 5.43 | learning rate: 1.035E-05 | global batch size: 512 | lm loss: 4.887984E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.271 | TFLOPs: 71.20 | +[default7]: iteration 3095/ 3814 | consumed samples: 1584640 | consumed tokens: 3245342720 | elapsed time per iteration (s): 5.44 | learning rate: 1.032E-05 | global batch size: 512 | lm loss: 4.883910E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.083 | TFLOPs: 71.06 | +[default7]: iteration 3096/ 3814 | consumed samples: 1585152 | consumed tokens: 3246391296 | elapsed time per iteration (s): 5.45 | learning rate: 1.029E-05 | global batch size: 512 | lm loss: 4.905190E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.976 | TFLOPs: 70.98 | +[default7]: iteration 3097/ 3814 | consumed samples: 1585664 | consumed tokens: 3247439872 | elapsed time per iteration (s): 5.45 | learning rate: 1.026E-05 | global batch size: 512 | lm loss: 4.897444E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.940 | TFLOPs: 70.95 | +[default7]: iteration 3098/ 3814 | consumed samples: 1586176 | consumed tokens: 3248488448 | elapsed time per iteration (s): 5.44 | learning rate: 1.024E-05 | global batch size: 512 | lm loss: 4.892115E+00 | loss scale: 262144.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.178 | TFLOPs: 71.13 | +[default7]: iteration 3099/ 3814 | consumed samples: 1586688 | consumed tokens: 3249537024 | elapsed time per iteration (s): 5.46 | learning rate: 1.021E-05 | global batch size: 512 | lm loss: 4.911932E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.733 | TFLOPs: 70.80 | +[default7]: iteration 3100/ 3814 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 5.44 | learning rate: 1.018E-05 | global batch size: 512 | lm loss: 4.896057E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.123 | TFLOPs: 71.09 | +[default7]: iteration 3101/ 3814 | consumed samples: 1587712 | consumed tokens: 3251634176 | elapsed time per iteration (s): 5.44 | learning rate: 1.015E-05 | global batch size: 512 | lm loss: 4.896327E+00 | loss scale: 262144.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.105 | TFLOPs: 71.08 | +[default7]: iteration 3102/ 3814 | consumed samples: 1588224 | consumed tokens: 3252682752 | elapsed time per iteration (s): 5.49 | learning rate: 1.013E-05 | global batch size: 512 | lm loss: 4.897120E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.335 | TFLOPs: 70.50 | +[default7]: iteration 3103/ 3814 | consumed samples: 1588736 | consumed tokens: 3253731328 | elapsed time per iteration (s): 5.46 | learning rate: 1.010E-05 | global batch size: 512 | lm loss: 4.895566E+00 | loss scale: 262144.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.692 | TFLOPs: 70.77 | +[default7]: iteration 3104/ 3814 | consumed samples: 1589248 | consumed tokens: 3254779904 | elapsed time per iteration (s): 5.44 | learning rate: 1.007E-05 | global batch size: 512 | lm loss: 4.902483E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.200 | TFLOPs: 71.15 | +[default7]: iteration 3105/ 3814 | consumed samples: 1589760 | consumed tokens: 3255828480 | elapsed time per iteration (s): 5.43 | learning rate: 1.004E-05 | global batch size: 512 | lm loss: 4.909990E+00 | loss scale: 262144.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.267 | TFLOPs: 71.20 | +[default7]: iteration 3106/ 3814 | consumed samples: 1590272 | consumed tokens: 3256877056 | elapsed time per iteration (s): 5.44 | learning rate: 1.002E-05 | global batch size: 512 | lm loss: 4.924037E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.052 | TFLOPs: 71.04 | +[default7]: iteration 3107/ 3814 | consumed samples: 1590784 | consumed tokens: 3257925632 | elapsed time per iteration (s): 5.44 | learning rate: 9.988E-06 | global batch size: 512 | lm loss: 4.887387E+00 | loss scale: 262144.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.189 | TFLOPs: 71.14 | +[default7]: iteration 3108/ 3814 | consumed samples: 1591296 | consumed tokens: 3258974208 | elapsed time per iteration (s): 5.44 | learning rate: 9.961E-06 | global batch size: 512 | lm loss: 4.893311E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.081 | TFLOPs: 71.06 | +[default7]: iteration 3109/ 3814 | consumed samples: 1591808 | consumed tokens: 3260022784 | elapsed time per iteration (s): 5.45 | learning rate: 9.934E-06 | global batch size: 512 | lm loss: 4.880699E+00 | loss scale: 262144.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.021 | TFLOPs: 71.02 | +[default7]: iteration 3110/ 3814 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 5.44 | learning rate: 9.907E-06 | global batch size: 512 | lm loss: 4.867469E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.135 | TFLOPs: 71.10 | +[default7]: iteration 3111/ 3814 | consumed samples: 1592832 | consumed tokens: 3262119936 | elapsed time per iteration (s): 5.44 | learning rate: 9.880E-06 | global batch size: 512 | lm loss: 4.892900E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.097 | TFLOPs: 71.07 | +[default7]: iteration 3112/ 3814 | consumed samples: 1593344 | consumed tokens: 3263168512 | elapsed time per iteration (s): 5.44 | learning rate: 9.853E-06 | global batch size: 512 | lm loss: 4.900352E+00 | loss scale: 262144.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.202 | TFLOPs: 71.15 | +[default7]: iteration 3113/ 3814 | consumed samples: 1593856 | consumed tokens: 3264217088 | elapsed time per iteration (s): 5.45 | learning rate: 9.826E-06 | global batch size: 512 | lm loss: 4.887175E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.012 | TFLOPs: 71.01 | +[default7]: iteration 3114/ 3814 | consumed samples: 1594368 | consumed tokens: 3265265664 | elapsed time per iteration (s): 5.41 | learning rate: 9.799E-06 | global batch size: 512 | lm loss: 4.890293E+00 | loss scale: 262144.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.632 | TFLOPs: 71.48 | +[default7]: iteration 3115/ 3814 | consumed samples: 1594880 | consumed tokens: 3266314240 | elapsed time per iteration (s): 5.43 | learning rate: 9.772E-06 | global batch size: 512 | lm loss: 4.891609E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.357 | TFLOPs: 71.27 | +[default7]: iteration 3116/ 3814 | consumed samples: 1595392 | consumed tokens: 3267362816 | elapsed time per iteration (s): 5.44 | learning rate: 9.745E-06 | global batch size: 512 | lm loss: 4.888391E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.101 | TFLOPs: 71.08 | +[default7]: iteration 3117/ 3814 | consumed samples: 1595904 | consumed tokens: 3268411392 | elapsed time per iteration (s): 5.42 | learning rate: 9.718E-06 | global batch size: 512 | lm loss: 4.859272E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.431 | TFLOPs: 71.32 | +[default7]: iteration 3118/ 3814 | consumed samples: 1596416 | consumed tokens: 3269459968 | elapsed time per iteration (s): 5.44 | learning rate: 9.691E-06 | global batch size: 512 | lm loss: 4.926496E+00 | loss scale: 262144.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.154 | TFLOPs: 71.12 | +[default7]: iteration 3119/ 3814 | consumed samples: 1596928 | consumed tokens: 3270508544 | elapsed time per iteration (s): 5.42 | learning rate: 9.664E-06 | global batch size: 512 | lm loss: 4.918703E+00 | loss scale: 262144.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.497 | TFLOPs: 71.37 | +[default7]: iteration 3120/ 3814 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 5.43 | learning rate: 9.637E-06 | global batch size: 512 | lm loss: 4.892521E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.364 | TFLOPs: 71.27 | +[default7]: iteration 3121/ 3814 | consumed samples: 1597952 | consumed tokens: 3272605696 | elapsed time per iteration (s): 5.44 | learning rate: 9.610E-06 | global batch size: 512 | lm loss: 4.885956E+00 | loss scale: 262144.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.147 | TFLOPs: 71.11 | +[default7]: iteration 3122/ 3814 | consumed samples: 1598464 | consumed tokens: 3273654272 | elapsed time per iteration (s): 5.43 | learning rate: 9.583E-06 | global batch size: 512 | lm loss: 4.896502E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.290 | TFLOPs: 71.22 | +[default7]: iteration 3123/ 3814 | consumed samples: 1598976 | consumed tokens: 3274702848 | elapsed time per iteration (s): 5.45 | learning rate: 9.557E-06 | global batch size: 512 | lm loss: 4.931229E+00 | loss scale: 262144.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.948 | TFLOPs: 70.96 | +[default7]: iteration 3124/ 3814 | consumed samples: 1599488 | consumed tokens: 3275751424 | elapsed time per iteration (s): 5.43 | learning rate: 9.530E-06 | global batch size: 512 | lm loss: 4.882224E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.329 | TFLOPs: 71.25 | +[default7]: iteration 3125/ 3814 | consumed samples: 1600000 | consumed tokens: 3276800000 | elapsed time per iteration (s): 5.44 | learning rate: 9.503E-06 | global batch size: 512 | lm loss: 4.891042E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.143 | TFLOPs: 71.11 | +[default7]: iteration 3126/ 3814 | consumed samples: 1600512 | consumed tokens: 3277848576 | elapsed time per iteration (s): 5.46 | learning rate: 9.477E-06 | global batch size: 512 | lm loss: 4.885410E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.818 | TFLOPs: 70.86 | +[default7]: iteration 3127/ 3814 | consumed samples: 1601024 | consumed tokens: 3278897152 | elapsed time per iteration (s): 5.45 | learning rate: 9.450E-06 | global batch size: 512 | lm loss: 4.885213E+00 | loss scale: 262144.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.893 | TFLOPs: 70.92 | +[default7]: iteration 3128/ 3814 | consumed samples: 1601536 | consumed tokens: 3279945728 | elapsed time per iteration (s): 5.46 | learning rate: 9.424E-06 | global batch size: 512 | lm loss: 4.877335E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.808 | TFLOPs: 70.85 | +[default7]: iteration 3129/ 3814 | consumed samples: 1602048 | consumed tokens: 3280994304 | elapsed time per iteration (s): 5.48 | learning rate: 9.397E-06 | global batch size: 512 | lm loss: 4.885622E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.459 | TFLOPs: 70.59 | +[default7]: iteration 3130/ 3814 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 5.46 | learning rate: 9.371E-06 | global batch size: 512 | lm loss: 4.867461E+00 | loss scale: 262144.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.833 | TFLOPs: 70.87 | +[default7]: iteration 3131/ 3814 | consumed samples: 1603072 | consumed tokens: 3283091456 | elapsed time per iteration (s): 5.46 | learning rate: 9.344E-06 | global batch size: 512 | lm loss: 4.898586E+00 | loss scale: 262144.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.746 | TFLOPs: 70.81 | +[default7]: iteration 3132/ 3814 | consumed samples: 1603584 | consumed tokens: 3284140032 | elapsed time per iteration (s): 5.44 | learning rate: 9.318E-06 | global batch size: 512 | lm loss: 4.887995E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.101 | TFLOPs: 71.08 | +[default7]: iteration 3133/ 3814 | consumed samples: 1604096 | consumed tokens: 3285188608 | elapsed time per iteration (s): 5.45 | learning rate: 9.291E-06 | global batch size: 512 | lm loss: 4.930075E+00 | loss scale: 262144.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.872 | TFLOPs: 70.90 | +[default7]: iteration 3134/ 3814 | consumed samples: 1604608 | consumed tokens: 3286237184 | elapsed time per iteration (s): 5.46 | learning rate: 9.265E-06 | global batch size: 512 | lm loss: 4.896657E+00 | loss scale: 262144.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.708 | TFLOPs: 70.78 | +[default7]: iteration 3135/ 3814 | consumed samples: 1605120 | consumed tokens: 3287285760 | elapsed time per iteration (s): 5.45 | learning rate: 9.239E-06 | global batch size: 512 | lm loss: 4.915595E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.887 | TFLOPs: 70.91 | +[default7]: iteration 3136/ 3814 | consumed samples: 1605632 | consumed tokens: 3288334336 | elapsed time per iteration (s): 5.45 | learning rate: 9.212E-06 | global batch size: 512 | lm loss: 4.869216E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.936 | TFLOPs: 70.95 | +[default7]: iteration 3137/ 3814 | consumed samples: 1606144 | consumed tokens: 3289382912 | elapsed time per iteration (s): 5.44 | learning rate: 9.186E-06 | global batch size: 512 | lm loss: 4.887948E+00 | loss scale: 262144.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.136 | TFLOPs: 71.10 | +[default7]: iteration 3138/ 3814 | consumed samples: 1606656 | consumed tokens: 3290431488 | elapsed time per iteration (s): 5.44 | learning rate: 9.160E-06 | global batch size: 512 | lm loss: 4.893489E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.089 | TFLOPs: 71.07 | +[default7]: iteration 3139/ 3814 | consumed samples: 1607168 | consumed tokens: 3291480064 | elapsed time per iteration (s): 5.46 | learning rate: 9.134E-06 | global batch size: 512 | lm loss: 4.884957E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.856 | TFLOPs: 70.89 | +[default7]: iteration 3140/ 3814 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 5.44 | learning rate: 9.107E-06 | global batch size: 512 | lm loss: 4.904630E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.189 | TFLOPs: 71.14 | +[default7]: iteration 3141/ 3814 | consumed samples: 1608192 | consumed tokens: 3293577216 | elapsed time per iteration (s): 5.43 | learning rate: 9.081E-06 | global batch size: 512 | lm loss: 4.861763E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.326 | TFLOPs: 71.25 | +[default7]: iteration 3142/ 3814 | consumed samples: 1608704 | consumed tokens: 3294625792 | elapsed time per iteration (s): 5.44 | learning rate: 9.055E-06 | global batch size: 512 | lm loss: 4.892604E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.087 | TFLOPs: 71.07 | +[default7]: iteration 3143/ 3814 | consumed samples: 1609216 | consumed tokens: 3295674368 | elapsed time per iteration (s): 5.43 | learning rate: 9.029E-06 | global batch size: 512 | lm loss: 4.915665E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.312 | TFLOPs: 71.24 | +[default7]: iteration 3144/ 3814 | consumed samples: 1609728 | consumed tokens: 3296722944 | elapsed time per iteration (s): 5.44 | learning rate: 9.003E-06 | global batch size: 512 | lm loss: 4.880769E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.110 | TFLOPs: 71.08 | +[default7]: iteration 3145/ 3814 | consumed samples: 1610240 | consumed tokens: 3297771520 | elapsed time per iteration (s): 5.46 | learning rate: 8.977E-06 | global batch size: 512 | lm loss: 4.911075E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.842 | TFLOPs: 70.88 | +[default7]: iteration 3146/ 3814 | consumed samples: 1610752 | consumed tokens: 3298820096 | elapsed time per iteration (s): 5.44 | learning rate: 8.951E-06 | global batch size: 512 | lm loss: 4.912363E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.181 | TFLOPs: 71.14 | +[default7]: iteration 3147/ 3814 | consumed samples: 1611264 | consumed tokens: 3299868672 | elapsed time per iteration (s): 5.43 | learning rate: 8.925E-06 | global batch size: 512 | lm loss: 4.884324E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.221 | TFLOPs: 71.17 | +[default7]: iteration 3148/ 3814 | consumed samples: 1611776 | consumed tokens: 3300917248 | elapsed time per iteration (s): 5.45 | learning rate: 8.899E-06 | global batch size: 512 | lm loss: 4.889190E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.913 | TFLOPs: 70.93 | +[default7]: iteration 3149/ 3814 | consumed samples: 1612288 | consumed tokens: 3301965824 | elapsed time per iteration (s): 5.44 | learning rate: 8.873E-06 | global batch size: 512 | lm loss: 4.876094E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.140 | TFLOPs: 71.10 | +[default7]: iteration 3150/ 3814 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 5.44 | learning rate: 8.848E-06 | global batch size: 512 | lm loss: 4.856820E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.185 | TFLOPs: 71.14 | +[default7]: iteration 3151/ 3814 | consumed samples: 1613312 | consumed tokens: 3304062976 | elapsed time per iteration (s): 5.41 | learning rate: 8.822E-06 | global batch size: 512 | lm loss: 4.893485E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.598 | TFLOPs: 71.45 | +[default7]: iteration 3152/ 3814 | consumed samples: 1613824 | consumed tokens: 3305111552 | elapsed time per iteration (s): 5.41 | learning rate: 8.796E-06 | global batch size: 512 | lm loss: 4.917921E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.575 | TFLOPs: 71.43 | +[default7]: iteration 3153/ 3814 | consumed samples: 1614336 | consumed tokens: 3306160128 | elapsed time per iteration (s): 5.43 | learning rate: 8.770E-06 | global batch size: 512 | lm loss: 4.920229E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.337 | TFLOPs: 71.25 | +[default7]: iteration 3154/ 3814 | consumed samples: 1614848 | consumed tokens: 3307208704 | elapsed time per iteration (s): 5.43 | learning rate: 8.745E-06 | global batch size: 512 | lm loss: 4.855443E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.283 | TFLOPs: 71.21 | +[default7]: iteration 3155/ 3814 | consumed samples: 1615360 | consumed tokens: 3308257280 | elapsed time per iteration (s): 5.43 | learning rate: 8.719E-06 | global batch size: 512 | lm loss: 4.881603E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.327 | TFLOPs: 71.25 | +[default7]: iteration 3156/ 3814 | consumed samples: 1615872 | consumed tokens: 3309305856 | elapsed time per iteration (s): 5.47 | learning rate: 8.693E-06 | global batch size: 512 | lm loss: 4.884543E+00 | loss scale: 262144.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.658 | TFLOPs: 70.74 | +[default7]: iteration 3157/ 3814 | consumed samples: 1616384 | consumed tokens: 3310354432 | elapsed time per iteration (s): 5.43 | learning rate: 8.668E-06 | global batch size: 512 | lm loss: 4.875193E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.304 | TFLOPs: 71.23 | +[default7]: iteration 3158/ 3814 | consumed samples: 1616896 | consumed tokens: 3311403008 | elapsed time per iteration (s): 5.43 | learning rate: 8.642E-06 | global batch size: 512 | lm loss: 4.913799E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.250 | TFLOPs: 71.19 | +[default7]: iteration 3159/ 3814 | consumed samples: 1617408 | consumed tokens: 3312451584 | elapsed time per iteration (s): 5.43 | learning rate: 8.617E-06 | global batch size: 512 | lm loss: 4.909905E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.315 | TFLOPs: 71.24 | +[default7]: iteration 3160/ 3814 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 5.43 | learning rate: 8.591E-06 | global batch size: 512 | lm loss: 4.883123E+00 | loss scale: 262144.0 | grad norm: 0.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.321 | TFLOPs: 71.24 | +[default7]: iteration 3161/ 3814 | consumed samples: 1618432 | consumed tokens: 3314548736 | elapsed time per iteration (s): 5.44 | learning rate: 8.566E-06 | global batch size: 512 | lm loss: 4.907825E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.083 | TFLOPs: 71.06 | +[default7]: iteration 3162/ 3814 | consumed samples: 1618944 | consumed tokens: 3315597312 | elapsed time per iteration (s): 5.45 | learning rate: 8.540E-06 | global batch size: 512 | lm loss: 4.894458E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.913 | TFLOPs: 70.93 | +[default7]: iteration 3163/ 3814 | consumed samples: 1619456 | consumed tokens: 3316645888 | elapsed time per iteration (s): 5.48 | learning rate: 8.515E-06 | global batch size: 512 | lm loss: 4.899601E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.391 | TFLOPs: 70.54 | +[default7]: iteration 3164/ 3814 | consumed samples: 1619968 | consumed tokens: 3317694464 | elapsed time per iteration (s): 5.42 | learning rate: 8.490E-06 | global batch size: 512 | lm loss: 4.886622E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.430 | TFLOPs: 71.32 | +[default7]: iteration 3165/ 3814 | consumed samples: 1620480 | consumed tokens: 3318743040 | elapsed time per iteration (s): 5.46 | learning rate: 8.464E-06 | global batch size: 512 | lm loss: 4.894057E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.838 | TFLOPs: 70.88 | +[default7]: iteration 3166/ 3814 | consumed samples: 1620992 | consumed tokens: 3319791616 | elapsed time per iteration (s): 5.46 | learning rate: 8.439E-06 | global batch size: 512 | lm loss: 4.876204E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.839 | TFLOPs: 70.88 | +[default7]: iteration 3167/ 3814 | consumed samples: 1621504 | consumed tokens: 3320840192 | elapsed time per iteration (s): 5.47 | learning rate: 8.414E-06 | global batch size: 512 | lm loss: 4.890078E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.520 | TFLOPs: 70.64 | +[default7]: iteration 3168/ 3814 | consumed samples: 1622016 | consumed tokens: 3321888768 | elapsed time per iteration (s): 5.46 | learning rate: 8.389E-06 | global batch size: 512 | lm loss: 4.892224E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.843 | TFLOPs: 70.88 | +[default7]: iteration 3169/ 3814 | consumed samples: 1622528 | consumed tokens: 3322937344 | elapsed time per iteration (s): 5.44 | learning rate: 8.363E-06 | global batch size: 512 | lm loss: 4.909470E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.098 | TFLOPs: 71.07 | +[default7]: iteration 3170/ 3814 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 5.42 | learning rate: 8.338E-06 | global batch size: 512 | lm loss: 4.898575E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.414 | TFLOPs: 71.31 | +[default7]: iteration 3171/ 3814 | consumed samples: 1623552 | consumed tokens: 3325034496 | elapsed time per iteration (s): 5.46 | learning rate: 8.313E-06 | global batch size: 512 | lm loss: 4.903400E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.798 | TFLOPs: 70.85 | +[default7]: iteration 3172/ 3814 | consumed samples: 1624064 | consumed tokens: 3326083072 | elapsed time per iteration (s): 5.44 | learning rate: 8.288E-06 | global batch size: 512 | lm loss: 4.898592E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.053 | TFLOPs: 71.04 | +[default7]: iteration 3173/ 3814 | consumed samples: 1624576 | consumed tokens: 3327131648 | elapsed time per iteration (s): 5.44 | learning rate: 8.263E-06 | global batch size: 512 | lm loss: 4.891199E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.106 | TFLOPs: 71.08 | +[default7]: iteration 3174/ 3814 | consumed samples: 1625088 | consumed tokens: 3328180224 | elapsed time per iteration (s): 5.44 | learning rate: 8.238E-06 | global batch size: 512 | lm loss: 4.896667E+00 | loss scale: 262144.0 | grad norm: 0.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.148 | TFLOPs: 71.11 | +[default7]: iteration 3175/ 3814 | consumed samples: 1625600 | consumed tokens: 3329228800 | elapsed time per iteration (s): 5.42 | learning rate: 8.213E-06 | global batch size: 512 | lm loss: 4.898342E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.490 | TFLOPs: 71.37 | +[default7]: iteration 3176/ 3814 | consumed samples: 1626112 | consumed tokens: 3330277376 | elapsed time per iteration (s): 5.44 | learning rate: 8.188E-06 | global batch size: 512 | lm loss: 4.884049E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.146 | TFLOPs: 71.11 | +[default7]: iteration 3177/ 3814 | consumed samples: 1626624 | consumed tokens: 3331325952 | elapsed time per iteration (s): 5.44 | learning rate: 8.163E-06 | global batch size: 512 | lm loss: 4.889853E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.097 | TFLOPs: 71.07 | +[default7]: iteration 3178/ 3814 | consumed samples: 1627136 | consumed tokens: 3332374528 | elapsed time per iteration (s): 5.44 | learning rate: 8.138E-06 | global batch size: 512 | lm loss: 4.914798E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.033 | TFLOPs: 71.02 | +[default7]: iteration 3179/ 3814 | consumed samples: 1627648 | consumed tokens: 3333423104 | elapsed time per iteration (s): 5.45 | learning rate: 8.114E-06 | global batch size: 512 | lm loss: 4.899465E+00 | loss scale: 262144.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.945 | TFLOPs: 70.96 | +[default7]: iteration 3180/ 3814 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 5.45 | learning rate: 8.089E-06 | global batch size: 512 | lm loss: 4.855278E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.013 | TFLOPs: 71.01 | +[default7]: iteration 3181/ 3814 | consumed samples: 1628672 | consumed tokens: 3335520256 | elapsed time per iteration (s): 5.43 | learning rate: 8.064E-06 | global batch size: 512 | lm loss: 4.928129E+00 | loss scale: 262144.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.277 | TFLOPs: 71.21 | +[default7]: iteration 3182/ 3814 | consumed samples: 1629184 | consumed tokens: 3336568832 | elapsed time per iteration (s): 5.45 | learning rate: 8.039E-06 | global batch size: 512 | lm loss: 4.911413E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.920 | TFLOPs: 70.94 | +[default7]: iteration 3183/ 3814 | consumed samples: 1629696 | consumed tokens: 3337617408 | elapsed time per iteration (s): 5.46 | learning rate: 8.015E-06 | global batch size: 512 | lm loss: 4.888097E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.815 | TFLOPs: 70.86 | +[default7]: iteration 3184/ 3814 | consumed samples: 1630208 | consumed tokens: 3338665984 | elapsed time per iteration (s): 5.47 | learning rate: 7.990E-06 | global batch size: 512 | lm loss: 4.896648E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.686 | TFLOPs: 70.76 | +[default7]: iteration 3185/ 3814 | consumed samples: 1630720 | consumed tokens: 3339714560 | elapsed time per iteration (s): 5.47 | learning rate: 7.965E-06 | global batch size: 512 | lm loss: 4.895738E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.644 | TFLOPs: 70.73 | +[default7]: iteration 3186/ 3814 | consumed samples: 1631232 | consumed tokens: 3340763136 | elapsed time per iteration (s): 5.45 | learning rate: 7.941E-06 | global batch size: 512 | lm loss: 4.911983E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.876 | TFLOPs: 70.91 | +[default7]: iteration 3187/ 3814 | consumed samples: 1631744 | consumed tokens: 3341811712 | elapsed time per iteration (s): 5.43 | learning rate: 7.916E-06 | global batch size: 512 | lm loss: 4.873192E+00 | loss scale: 262144.0 | grad norm: 0.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.223 | TFLOPs: 71.17 | +[default7]: iteration 3188/ 3814 | consumed samples: 1632256 | consumed tokens: 3342860288 | elapsed time per iteration (s): 5.43 | learning rate: 7.892E-06 | global batch size: 512 | lm loss: 4.877746E+00 | loss scale: 262144.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.263 | TFLOPs: 71.20 | +[default7]: iteration 3189/ 3814 | consumed samples: 1632768 | consumed tokens: 3343908864 | elapsed time per iteration (s): 5.42 | learning rate: 7.867E-06 | global batch size: 512 | lm loss: 4.885969E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.526 | TFLOPs: 71.40 | +[default7]: iteration 3190/ 3814 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 5.45 | learning rate: 7.843E-06 | global batch size: 512 | lm loss: 4.906846E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.030 | TFLOPs: 71.02 | +[default7]: iteration 3191/ 3814 | consumed samples: 1633792 | consumed tokens: 3346006016 | elapsed time per iteration (s): 5.44 | learning rate: 7.818E-06 | global batch size: 512 | lm loss: 4.891609E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.191 | TFLOPs: 71.14 | +[default7]: iteration 3192/ 3814 | consumed samples: 1634304 | consumed tokens: 3347054592 | elapsed time per iteration (s): 5.44 | learning rate: 7.794E-06 | global batch size: 512 | lm loss: 4.880786E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.148 | TFLOPs: 71.11 | +[default7]: iteration 3193/ 3814 | consumed samples: 1634816 | consumed tokens: 3348103168 | elapsed time per iteration (s): 5.44 | learning rate: 7.770E-06 | global batch size: 512 | lm loss: 4.886089E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.124 | TFLOPs: 71.09 | +[default7]: iteration 3194/ 3814 | consumed samples: 1635328 | consumed tokens: 3349151744 | elapsed time per iteration (s): 5.43 | learning rate: 7.745E-06 | global batch size: 512 | lm loss: 4.881205E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.377 | TFLOPs: 71.28 | +[default7]: iteration 3195/ 3814 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 5.48 | learning rate: 7.721E-06 | global batch size: 512 | lm loss: 4.899330E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.358 | TFLOPs: 70.51 | +[default7]: iteration 3196/ 3814 | consumed samples: 1636352 | consumed tokens: 3351248896 | elapsed time per iteration (s): 5.44 | learning rate: 7.697E-06 | global batch size: 512 | lm loss: 4.868029E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.130 | TFLOPs: 71.10 | +[default7]: iteration 3197/ 3814 | consumed samples: 1636864 | consumed tokens: 3352297472 | elapsed time per iteration (s): 5.45 | learning rate: 7.673E-06 | global batch size: 512 | lm loss: 4.892441E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.967 | TFLOPs: 70.97 | +[default7]: iteration 3198/ 3814 | consumed samples: 1637376 | consumed tokens: 3353346048 | elapsed time per iteration (s): 5.43 | learning rate: 7.648E-06 | global batch size: 512 | lm loss: 4.892908E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.283 | TFLOPs: 71.21 | +[default7]: iteration 3199/ 3814 | consumed samples: 1637888 | consumed tokens: 3354394624 | elapsed time per iteration (s): 5.45 | learning rate: 7.624E-06 | global batch size: 512 | lm loss: 4.889613E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.948 | TFLOPs: 70.96 | +[default7]: iteration 3200/ 3814 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 5.47 | learning rate: 7.600E-06 | global batch size: 512 | lm loss: 4.893366E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.682 | TFLOPs: 70.76 | +[default7]: iteration 3201/ 3814 | consumed samples: 1638912 | consumed tokens: 3356491776 | elapsed time per iteration (s): 5.45 | learning rate: 7.576E-06 | global batch size: 512 | lm loss: 4.902358E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.026 | TFLOPs: 71.02 | +[default7]: iteration 3202/ 3814 | consumed samples: 1639424 | consumed tokens: 3357540352 | elapsed time per iteration (s): 5.45 | learning rate: 7.552E-06 | global batch size: 512 | lm loss: 4.905367E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.884 | TFLOPs: 70.91 | +[default7]: iteration 3203/ 3814 | consumed samples: 1639936 | consumed tokens: 3358588928 | elapsed time per iteration (s): 5.45 | learning rate: 7.528E-06 | global batch size: 512 | lm loss: 4.870996E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.939 | TFLOPs: 70.95 | +[default7]: iteration 3204/ 3814 | consumed samples: 1640448 | consumed tokens: 3359637504 | elapsed time per iteration (s): 5.44 | learning rate: 7.504E-06 | global batch size: 512 | lm loss: 4.899618E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.120 | TFLOPs: 71.09 | +[default7]: iteration 3205/ 3814 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 5.44 | learning rate: 7.480E-06 | global batch size: 512 | lm loss: 4.904160E+00 | loss scale: 262144.0 | grad norm: 0.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.119 | TFLOPs: 71.09 | +[default7]: iteration 3206/ 3814 | consumed samples: 1641472 | consumed tokens: 3361734656 | elapsed time per iteration (s): 5.39 | learning rate: 7.456E-06 | global batch size: 512 | lm loss: 4.890297E+00 | loss scale: 262144.0 | grad norm: 0.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.970 | TFLOPs: 71.73 | +[default7]: iteration 3207/ 3814 | consumed samples: 1641984 | consumed tokens: 3362783232 | elapsed time per iteration (s): 5.46 | learning rate: 7.432E-06 | global batch size: 512 | lm loss: 4.872692E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.719 | TFLOPs: 70.79 | +[default7]: iteration 3208/ 3814 | consumed samples: 1642496 | consumed tokens: 3363831808 | elapsed time per iteration (s): 5.43 | learning rate: 7.409E-06 | global batch size: 512 | lm loss: 4.873666E+00 | loss scale: 262144.0 | grad norm: 0.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.218 | TFLOPs: 71.16 | +[default7]: iteration 3209/ 3814 | consumed samples: 1643008 | consumed tokens: 3364880384 | elapsed time per iteration (s): 5.48 | learning rate: 7.385E-06 | global batch size: 512 | lm loss: 4.907383E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.415 | TFLOPs: 70.56 | +[default7]: iteration 3210/ 3814 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 5.40 | learning rate: 7.361E-06 | global batch size: 512 | lm loss: 4.874053E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.838 | TFLOPs: 71.63 | +[default7]: iteration 3211/ 3814 | consumed samples: 1644032 | consumed tokens: 3366977536 | elapsed time per iteration (s): 5.44 | learning rate: 7.337E-06 | global batch size: 512 | lm loss: 4.894757E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.043 | TFLOPs: 71.03 | +[default7]: iteration 3212/ 3814 | consumed samples: 1644544 | consumed tokens: 3368026112 | elapsed time per iteration (s): 5.46 | learning rate: 7.314E-06 | global batch size: 512 | lm loss: 4.897040E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.839 | TFLOPs: 70.88 | +[default7]: iteration 3213/ 3814 | consumed samples: 1645056 | consumed tokens: 3369074688 | elapsed time per iteration (s): 5.40 | learning rate: 7.290E-06 | global batch size: 512 | lm loss: 4.898105E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.818 | TFLOPs: 71.62 | +[default7]: iteration 3214/ 3814 | consumed samples: 1645568 | consumed tokens: 3370123264 | elapsed time per iteration (s): 5.49 | learning rate: 7.266E-06 | global batch size: 512 | lm loss: 4.883702E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.268 | TFLOPs: 70.45 | +[default7]: iteration 3215/ 3814 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 5.46 | learning rate: 7.243E-06 | global batch size: 512 | lm loss: 4.900812E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.804 | TFLOPs: 70.85 | +[default7]: iteration 3216/ 3814 | consumed samples: 1646592 | consumed tokens: 3372220416 | elapsed time per iteration (s): 5.44 | learning rate: 7.219E-06 | global batch size: 512 | lm loss: 4.895722E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.179 | TFLOPs: 71.13 | +[default7]: iteration 3217/ 3814 | consumed samples: 1647104 | consumed tokens: 3373268992 | elapsed time per iteration (s): 5.45 | learning rate: 7.196E-06 | global batch size: 512 | lm loss: 4.896819E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.872 | TFLOPs: 70.90 | +[default7]: iteration 3218/ 3814 | consumed samples: 1647616 | consumed tokens: 3374317568 | elapsed time per iteration (s): 5.45 | learning rate: 7.172E-06 | global batch size: 512 | lm loss: 4.893477E+00 | loss scale: 262144.0 | grad norm: 0.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.908 | TFLOPs: 70.93 | +[default7]: iteration 3219/ 3814 | consumed samples: 1648128 | consumed tokens: 3375366144 | elapsed time per iteration (s): 5.45 | learning rate: 7.149E-06 | global batch size: 512 | lm loss: 4.896156E+00 | loss scale: 262144.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.948 | TFLOPs: 70.96 | +[default7]: iteration 3220/ 3814 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 5.44 | learning rate: 7.126E-06 | global batch size: 512 | lm loss: 4.877630E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.111 | TFLOPs: 71.08 | +[default7]: iteration 3221/ 3814 | consumed samples: 1649152 | consumed tokens: 3377463296 | elapsed time per iteration (s): 5.46 | learning rate: 7.102E-06 | global batch size: 512 | lm loss: 4.883619E+00 | loss scale: 262144.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.744 | TFLOPs: 70.81 | +[default7]: iteration 3222/ 3814 | consumed samples: 1649664 | consumed tokens: 3378511872 | elapsed time per iteration (s): 5.46 | learning rate: 7.079E-06 | global batch size: 512 | lm loss: 4.873229E+00 | loss scale: 262144.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.785 | TFLOPs: 70.84 | +[default7]: iteration 3223/ 3814 | consumed samples: 1650176 | consumed tokens: 3379560448 | elapsed time per iteration (s): 5.45 | learning rate: 7.056E-06 | global batch size: 512 | lm loss: 4.901267E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.892 | TFLOPs: 70.92 | +[default7]: iteration 3224/ 3814 | consumed samples: 1650688 | consumed tokens: 3380609024 | elapsed time per iteration (s): 5.45 | learning rate: 7.032E-06 | global batch size: 512 | lm loss: 4.893874E+00 | loss scale: 262144.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.971 | TFLOPs: 70.98 | +[default7]: iteration 3225/ 3814 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 5.45 | learning rate: 7.009E-06 | global batch size: 512 | lm loss: 4.890083E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.984 | TFLOPs: 70.99 | +[default7]: iteration 3226/ 3814 | consumed samples: 1651712 | consumed tokens: 3382706176 | elapsed time per iteration (s): 5.44 | learning rate: 6.986E-06 | global batch size: 512 | lm loss: 4.889584E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.194 | TFLOPs: 71.15 | +[default7]: iteration 3227/ 3814 | consumed samples: 1652224 | consumed tokens: 3383754752 | elapsed time per iteration (s): 5.46 | learning rate: 6.963E-06 | global batch size: 512 | lm loss: 4.864236E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.838 | TFLOPs: 70.88 | +[default7]: iteration 3228/ 3814 | consumed samples: 1652736 | consumed tokens: 3384803328 | elapsed time per iteration (s): 5.45 | learning rate: 6.940E-06 | global batch size: 512 | lm loss: 4.873882E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.957 | TFLOPs: 70.97 | +[default7]: iteration 3229/ 3814 | consumed samples: 1653248 | consumed tokens: 3385851904 | elapsed time per iteration (s): 5.45 | learning rate: 6.917E-06 | global batch size: 512 | lm loss: 4.883638E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.919 | TFLOPs: 70.94 | +[default7]: iteration 3230/ 3814 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 5.44 | learning rate: 6.894E-06 | global batch size: 512 | lm loss: 4.877623E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.117 | TFLOPs: 71.09 | +[default7]: iteration 3231/ 3814 | consumed samples: 1654272 | consumed tokens: 3387949056 | elapsed time per iteration (s): 5.47 | learning rate: 6.871E-06 | global batch size: 512 | lm loss: 4.847892E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.607 | TFLOPs: 70.70 | +[default7]: iteration 3232/ 3814 | consumed samples: 1654784 | consumed tokens: 3388997632 | elapsed time per iteration (s): 5.45 | learning rate: 6.848E-06 | global batch size: 512 | lm loss: 4.891341E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.966 | TFLOPs: 70.97 | +[default7]: iteration 3233/ 3814 | consumed samples: 1655296 | consumed tokens: 3390046208 | elapsed time per iteration (s): 5.46 | learning rate: 6.825E-06 | global batch size: 512 | lm loss: 4.879620E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.835 | TFLOPs: 70.87 | +[default7]: iteration 3234/ 3814 | consumed samples: 1655808 | consumed tokens: 3391094784 | elapsed time per iteration (s): 5.44 | learning rate: 6.802E-06 | global batch size: 512 | lm loss: 4.874743E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.061 | TFLOPs: 71.05 | +[default7]: iteration 3235/ 3814 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 5.45 | learning rate: 6.779E-06 | global batch size: 512 | lm loss: 4.873177E+00 | loss scale: 262144.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.016 | TFLOPs: 71.01 | +[default7]: iteration 3236/ 3814 | consumed samples: 1656832 | consumed tokens: 3393191936 | elapsed time per iteration (s): 5.47 | learning rate: 6.756E-06 | global batch size: 512 | lm loss: 4.899149E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.654 | TFLOPs: 70.74 | +[default7]: iteration 3237/ 3814 | consumed samples: 1657344 | consumed tokens: 3394240512 | elapsed time per iteration (s): 5.47 | learning rate: 6.733E-06 | global batch size: 512 | lm loss: 4.891674E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.682 | TFLOPs: 70.76 | +[default7]: iteration 3238/ 3814 | consumed samples: 1657856 | consumed tokens: 3395289088 | elapsed time per iteration (s): 5.45 | learning rate: 6.711E-06 | global batch size: 512 | lm loss: 4.894403E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.873 | TFLOPs: 70.90 | +[default7]: iteration 3239/ 3814 | consumed samples: 1658368 | consumed tokens: 3396337664 | elapsed time per iteration (s): 5.42 | learning rate: 6.688E-06 | global batch size: 512 | lm loss: 4.879573E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.391 | TFLOPs: 71.29 | +[default7]: iteration 3240/ 3814 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 5.43 | learning rate: 6.665E-06 | global batch size: 512 | lm loss: 4.889745E+00 | loss scale: 262144.0 | grad norm: 0.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.236 | TFLOPs: 71.18 | +[default7]: iteration 3241/ 3814 | consumed samples: 1659392 | consumed tokens: 3398434816 | elapsed time per iteration (s): 5.43 | learning rate: 6.642E-06 | global batch size: 512 | lm loss: 4.861831E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.293 | TFLOPs: 71.22 | +[default7]: iteration 3242/ 3814 | consumed samples: 1659904 | consumed tokens: 3399483392 | elapsed time per iteration (s): 5.43 | learning rate: 6.620E-06 | global batch size: 512 | lm loss: 4.867723E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.208 | TFLOPs: 71.16 | +[default7]: iteration 3243/ 3814 | consumed samples: 1660416 | consumed tokens: 3400531968 | elapsed time per iteration (s): 5.44 | learning rate: 6.597E-06 | global batch size: 512 | lm loss: 4.881914E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.193 | TFLOPs: 71.14 | +[default7]: iteration 3244/ 3814 | consumed samples: 1660928 | consumed tokens: 3401580544 | elapsed time per iteration (s): 5.41 | learning rate: 6.575E-06 | global batch size: 512 | lm loss: 4.881909E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.698 | TFLOPs: 71.53 | +[default7]: iteration 3245/ 3814 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 5.42 | learning rate: 6.552E-06 | global batch size: 512 | lm loss: 4.861357E+00 | loss scale: 262144.0 | grad norm: 0.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.552 | TFLOPs: 71.42 | +[default7]: iteration 3246/ 3814 | consumed samples: 1661952 | consumed tokens: 3403677696 | elapsed time per iteration (s): 5.42 | learning rate: 6.530E-06 | global batch size: 512 | lm loss: 4.887056E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.401 | TFLOPs: 71.30 | +[default7]: iteration 3247/ 3814 | consumed samples: 1662464 | consumed tokens: 3404726272 | elapsed time per iteration (s): 5.43 | learning rate: 6.507E-06 | global batch size: 512 | lm loss: 4.876287E+00 | loss scale: 262144.0 | grad norm: 0.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.304 | TFLOPs: 71.23 | +[default7]: iteration 3248/ 3814 | consumed samples: 1662976 | consumed tokens: 3405774848 | elapsed time per iteration (s): 5.42 | learning rate: 6.485E-06 | global batch size: 512 | lm loss: 4.886286E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.484 | TFLOPs: 71.36 | +[default7]: iteration 3249/ 3814 | consumed samples: 1663488 | consumed tokens: 3406823424 | elapsed time per iteration (s): 5.44 | learning rate: 6.463E-06 | global batch size: 512 | lm loss: 4.881217E+00 | loss scale: 262144.0 | grad norm: 0.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.103 | TFLOPs: 71.08 | +[default0]:saving checkpoint at iteration 3250 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-16 17:28:01,680] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3250 is begin to save! +[default0]:[2023-02-16 17:28:01,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_00-model_states.pt... +[default7]: iteration 3250/ 3814 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 5.45 | learning rate: 6.440E-06 | global batch size: 512 | lm loss: 4.874021E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.936 | TFLOPs: 70.95 | +[default1]:[2023-02-16 17:28:01,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:01,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:01,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:01,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:01,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:01,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:01,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:01,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:01,921] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:01,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:01,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:01,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:01,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:02,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:02,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,213] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,329] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:02,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,440] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,505] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,562] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,619] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:02,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:02,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:02,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:02,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:02,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:02,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:02,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:03,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,010] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:03,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:03,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,064] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:03,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:03,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:03,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:03,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:03,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:03,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:03,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,241] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:03,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,299] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:03,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,294] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:03,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:03,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:03,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,415] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:03,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,407] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:03,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:03,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,473] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:03,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:03,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,533] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:03,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_00-model_states.pt... +[default1]:[2023-02-16 17:28:03,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:03,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_01-model_states.pt... +[default0]:[2023-02-16 17:28:03,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,650] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:03,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_00-model_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:28:03,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_00-model_states.pt. +[default0]:[2023-02-16 17:28:03,709] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_00_model_states.pt +[default0]:[2023-02-16 17:28:03,709] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_00_model_states.pt... +[default0]:[2023-02-16 17:28:03,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_00_model_states.pt. +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:28:03,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_01-model_states.pt... +[default1]:[2023-02-16 17:28:03,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_01-model_states.pt. +[default1]:[2023-02-16 17:28:03,705] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_01_model_states.pt +[default1]:[2023-02-16 17:28:03,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_01_model_states.pt... +[default1]:[2023-02-16 17:28:03,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_01_model_states.pt. +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 17:28:03,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 17:28:04,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,258] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,283] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,246] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,237] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,271] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 17:28:04,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,235] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,254] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,274] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,321] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,253] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,309] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 17:28:04,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,343] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,309] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,391] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,332] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,359] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 17:28:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,360] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,347] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,393] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 17:28:04,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,413] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,373] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,427] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,410] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 17:28:04,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,430] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 17:28:04,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,421] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,425] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,383] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,402] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,431] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,386] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,401] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,468] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,378] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,442] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,433] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 17:28:04,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,424] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,436] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,410] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,429] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 17:28:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,403] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,456] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,480] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 17:28:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,433] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,507] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,429] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 17:28:04,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,543] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,489] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,501] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,460] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 17:28:04,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 17:28:04,505] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 17:28:04,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 17:28:04,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 17:28:04,536] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 17:28:04,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,559] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 17:28:04,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,552] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,492] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 17:28:04,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 17:28:04,521] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 17:28:04,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 17:28:04,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,542] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,527] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 17:28:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 17:28:04,618] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 17:28:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 17:28:04,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 17:28:04,670] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 17:28:04,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 17:28:04,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 17:28:04,673] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 17:28:04,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 17:28:04,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,639] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,645] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]: successfully saved checkpoint at iteration 3250 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default7]:time (ms) | save-checkpoint: 3224.67 +[default7]:[2023-02-16 17:28:04,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 17:28:04,903] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 17:28:04,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 17:28:04,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 17:28:04,898] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 17:28:04,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]: iteration 3251/ 3814 | consumed samples: 1664512 | consumed tokens: 3408920576 | elapsed time per iteration (s): 8.69 | learning rate: 6.418E-06 | global batch size: 512 | lm loss: 4.850353E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 58.885 | TFLOPs: 44.48 | +[default7]: iteration 3252/ 3814 | consumed samples: 1665024 | consumed tokens: 3409969152 | elapsed time per iteration (s): 5.44 | learning rate: 6.396E-06 | global batch size: 512 | lm loss: 4.876642E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.037 | TFLOPs: 71.03 | +[default7]: iteration 3253/ 3814 | consumed samples: 1665536 | consumed tokens: 3411017728 | elapsed time per iteration (s): 5.42 | learning rate: 6.373E-06 | global batch size: 512 | lm loss: 4.890462E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.497 | TFLOPs: 71.37 | +[default7]: iteration 3254/ 3814 | consumed samples: 1666048 | consumed tokens: 3412066304 | elapsed time per iteration (s): 5.43 | learning rate: 6.351E-06 | global batch size: 512 | lm loss: 4.895408E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.222 | TFLOPs: 71.17 | +[default7]: iteration 3255/ 3814 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 5.44 | learning rate: 6.329E-06 | global batch size: 512 | lm loss: 4.890627E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.181 | TFLOPs: 71.14 | +[default7]: iteration 3256/ 3814 | consumed samples: 1667072 | consumed tokens: 3414163456 | elapsed time per iteration (s): 5.43 | learning rate: 6.307E-06 | global batch size: 512 | lm loss: 4.912820E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.316 | TFLOPs: 71.24 | +[default7]: iteration 3257/ 3814 | consumed samples: 1667584 | consumed tokens: 3415212032 | elapsed time per iteration (s): 5.44 | learning rate: 6.285E-06 | global batch size: 512 | lm loss: 4.884098E+00 | loss scale: 262144.0 | grad norm: 0.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.143 | TFLOPs: 71.11 | +[default7]: iteration 3258/ 3814 | consumed samples: 1668096 | consumed tokens: 3416260608 | elapsed time per iteration (s): 5.44 | learning rate: 6.263E-06 | global batch size: 512 | lm loss: 4.883155E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.097 | TFLOPs: 71.07 | +[default7]: iteration 3259/ 3814 | consumed samples: 1668608 | consumed tokens: 3417309184 | elapsed time per iteration (s): 5.44 | learning rate: 6.241E-06 | global batch size: 512 | lm loss: 4.871851E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.108 | TFLOPs: 71.08 | +[default7]: iteration 3260/ 3814 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 5.49 | learning rate: 6.219E-06 | global batch size: 512 | lm loss: 4.890221E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.312 | TFLOPs: 70.48 | +[default7]: iteration 3261/ 3814 | consumed samples: 1669632 | consumed tokens: 3419406336 | elapsed time per iteration (s): 5.47 | learning rate: 6.197E-06 | global batch size: 512 | lm loss: 4.870077E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.654 | TFLOPs: 70.74 | +[default7]: iteration 3262/ 3814 | consumed samples: 1670144 | consumed tokens: 3420454912 | elapsed time per iteration (s): 5.45 | learning rate: 6.175E-06 | global batch size: 512 | lm loss: 4.908275E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.876 | TFLOPs: 70.91 | +[default7]: iteration 3263/ 3814 | consumed samples: 1670656 | consumed tokens: 3421503488 | elapsed time per iteration (s): 5.43 | learning rate: 6.153E-06 | global batch size: 512 | lm loss: 4.871660E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.276 | TFLOPs: 71.21 | +[default7]: iteration 3264/ 3814 | consumed samples: 1671168 | consumed tokens: 3422552064 | elapsed time per iteration (s): 5.47 | learning rate: 6.131E-06 | global batch size: 512 | lm loss: 4.879763E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.553 | TFLOPs: 70.66 | +[default7]: iteration 3265/ 3814 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 5.46 | learning rate: 6.110E-06 | global batch size: 512 | lm loss: 4.890042E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.699 | TFLOPs: 70.77 | +[default7]: iteration 3266/ 3814 | consumed samples: 1672192 | consumed tokens: 3424649216 | elapsed time per iteration (s): 5.46 | learning rate: 6.088E-06 | global batch size: 512 | lm loss: 4.870063E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.787 | TFLOPs: 70.84 | +[default7]: iteration 3267/ 3814 | consumed samples: 1672704 | consumed tokens: 3425697792 | elapsed time per iteration (s): 5.46 | learning rate: 6.066E-06 | global batch size: 512 | lm loss: 4.861903E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.720 | TFLOPs: 70.79 | +[default7]: iteration 3268/ 3814 | consumed samples: 1673216 | consumed tokens: 3426746368 | elapsed time per iteration (s): 5.47 | learning rate: 6.045E-06 | global batch size: 512 | lm loss: 4.888389E+00 | loss scale: 262144.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.573 | TFLOPs: 70.68 | +[default7]: iteration 3269/ 3814 | consumed samples: 1673728 | consumed tokens: 3427794944 | elapsed time per iteration (s): 5.47 | learning rate: 6.023E-06 | global batch size: 512 | lm loss: 4.902473E+00 | loss scale: 262144.0 | grad norm: 0.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.640 | TFLOPs: 70.73 | +[default7]: iteration 3270/ 3814 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 5.45 | learning rate: 6.001E-06 | global batch size: 512 | lm loss: 4.852708E+00 | loss scale: 262144.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.887 | TFLOPs: 70.91 | +[default7]: iteration 3271/ 3814 | consumed samples: 1674752 | consumed tokens: 3429892096 | elapsed time per iteration (s): 5.48 | learning rate: 5.980E-06 | global batch size: 512 | lm loss: 4.867038E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.461 | TFLOPs: 70.59 | +[default7]: iteration 3272/ 3814 | consumed samples: 1675264 | consumed tokens: 3430940672 | elapsed time per iteration (s): 5.47 | learning rate: 5.958E-06 | global batch size: 512 | lm loss: 4.868953E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.571 | TFLOPs: 70.68 | +[default7]: iteration 3273/ 3814 | consumed samples: 1675776 | consumed tokens: 3431989248 | elapsed time per iteration (s): 5.48 | learning rate: 5.937E-06 | global batch size: 512 | lm loss: 4.874695E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.360 | TFLOPs: 70.52 | +[default7]: iteration 3274/ 3814 | consumed samples: 1676288 | consumed tokens: 3433037824 | elapsed time per iteration (s): 5.45 | learning rate: 5.915E-06 | global batch size: 512 | lm loss: 4.873499E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.983 | TFLOPs: 70.99 | +[default7]: iteration 3275/ 3814 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 5.47 | learning rate: 5.894E-06 | global batch size: 512 | lm loss: 4.878222E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.664 | TFLOPs: 70.75 | +[default7]: iteration 3276/ 3814 | consumed samples: 1677312 | consumed tokens: 3435134976 | elapsed time per iteration (s): 5.47 | learning rate: 5.872E-06 | global batch size: 512 | lm loss: 4.885500E+00 | loss scale: 262144.0 | grad norm: 0.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.652 | TFLOPs: 70.74 | +[default7]: iteration 3277/ 3814 | consumed samples: 1677824 | consumed tokens: 3436183552 | elapsed time per iteration (s): 5.45 | learning rate: 5.851E-06 | global batch size: 512 | lm loss: 4.901652E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.001 | TFLOPs: 71.00 | +[default7]: iteration 3278/ 3814 | consumed samples: 1678336 | consumed tokens: 3437232128 | elapsed time per iteration (s): 5.47 | learning rate: 5.830E-06 | global batch size: 512 | lm loss: 4.891019E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.624 | TFLOPs: 70.72 | +[default7]: iteration 3279/ 3814 | consumed samples: 1678848 | consumed tokens: 3438280704 | elapsed time per iteration (s): 5.44 | learning rate: 5.809E-06 | global batch size: 512 | lm loss: 4.907741E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.162 | TFLOPs: 71.12 | +[default7]: iteration 3280/ 3814 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 5.42 | learning rate: 5.787E-06 | global batch size: 512 | lm loss: 4.908208E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.430 | TFLOPs: 71.32 | +[default7]: iteration 3281/ 3814 | consumed samples: 1679872 | consumed tokens: 3440377856 | elapsed time per iteration (s): 5.45 | learning rate: 5.766E-06 | global batch size: 512 | lm loss: 4.855869E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.016 | TFLOPs: 71.01 | +[default7]: iteration 3282/ 3814 | consumed samples: 1680384 | consumed tokens: 3441426432 | elapsed time per iteration (s): 5.47 | learning rate: 5.745E-06 | global batch size: 512 | lm loss: 4.873388E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.606 | TFLOPs: 70.70 | +[default7]: iteration 3283/ 3814 | consumed samples: 1680896 | consumed tokens: 3442475008 | elapsed time per iteration (s): 5.48 | learning rate: 5.724E-06 | global batch size: 512 | lm loss: 4.860944E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.511 | TFLOPs: 70.63 | +[default7]: iteration 3284/ 3814 | consumed samples: 1681408 | consumed tokens: 3443523584 | elapsed time per iteration (s): 5.47 | learning rate: 5.703E-06 | global batch size: 512 | lm loss: 4.885261E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.537 | TFLOPs: 70.65 | +[default7]: iteration 3285/ 3814 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 5.45 | learning rate: 5.682E-06 | global batch size: 512 | lm loss: 4.871666E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.959 | TFLOPs: 70.97 | +[default7]: iteration 3286/ 3814 | consumed samples: 1682432 | consumed tokens: 3445620736 | elapsed time per iteration (s): 5.42 | learning rate: 5.661E-06 | global batch size: 512 | lm loss: 4.914732E+00 | loss scale: 262144.0 | grad norm: 0.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.390 | TFLOPs: 71.29 | +[default7]: iteration 3287/ 3814 | consumed samples: 1682944 | consumed tokens: 3446669312 | elapsed time per iteration (s): 5.43 | learning rate: 5.640E-06 | global batch size: 512 | lm loss: 4.859598E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.276 | TFLOPs: 71.21 | +[default7]: iteration 3288/ 3814 | consumed samples: 1683456 | consumed tokens: 3447717888 | elapsed time per iteration (s): 5.46 | learning rate: 5.619E-06 | global batch size: 512 | lm loss: 4.841945E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.717 | TFLOPs: 70.79 | +[default7]: iteration 3289/ 3814 | consumed samples: 1683968 | consumed tokens: 3448766464 | elapsed time per iteration (s): 5.50 | learning rate: 5.598E-06 | global batch size: 512 | lm loss: 4.891581E+00 | loss scale: 262144.0 | grad norm: 0.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.164 | TFLOPs: 70.37 | +[default7]: iteration 3290/ 3814 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 5.45 | learning rate: 5.577E-06 | global batch size: 512 | lm loss: 4.858720E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.932 | TFLOPs: 70.95 | +[default7]: iteration 3291/ 3814 | consumed samples: 1684992 | consumed tokens: 3450863616 | elapsed time per iteration (s): 5.47 | learning rate: 5.556E-06 | global batch size: 512 | lm loss: 4.884970E+00 | loss scale: 262144.0 | grad norm: 0.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.672 | TFLOPs: 70.75 | +srun: Job step aborted: Waiting up to 62 seconds for job step to finish. +slurmstepd: error: *** STEP 1219025.0 ON jean-zay-iam01 CANCELLED AT 2023-02-16T17:31:50 DUE TO TIME LIMIT *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448221 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208338 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122121 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053407 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937692 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448222 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937693 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790525 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932160 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448223 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790526 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053408 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122122 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199397 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448224 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199398 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208339 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937694 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790527 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208340 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122123 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932161 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122124 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053409 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448225 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199399 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790528 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208341 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932162 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053410 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199400 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790529 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208342 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122125 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932163 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937695 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122126 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448226 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208343 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790530 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448227 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790531 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053411 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199401 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448228 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208344 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122127 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932164 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053412 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199402 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937696 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 122128 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208345 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932165 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937697 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199403 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 790532 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053413 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932166 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937698 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053414 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199404 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932167 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937699 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937692 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937693 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937694 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937695 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937696 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937697 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937698 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1937699 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208338 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208339 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208340 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208341 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208342 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208343 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208344 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208345 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053407 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053408 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053409 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053410 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053411 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053412 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053413 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 4053414 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932160 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932161 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932162 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932163 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932164 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932165 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932166 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2932167 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199397 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199398 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199399 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199400 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199401 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199402 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199403 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 199404 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448221 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448222 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448224 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448226 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448227 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 448228 closing signal SIGTERM