diff --git "a/logs/main_log.txt" "b/logs/main_log.txt"
--- "a/logs/main_log.txt"
+++ "b/logs/main_log.txt"
@@ -1,11320 +1,3 @@
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
--------------------------------------------------
-DeepSpeed C++/CUDA extension op report
--------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
--------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
--------------------------------------------------
-op name ................ installed .. compatible
--------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
- [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
--------------------------------------------------
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']
-deepspeed info ................... 0.4.2+bc17042, bc17042, big-science
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
---------------------------------------------------
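This block records the exact stack every rank sees: torch 1.8.1 built for CUDA 11.1, nvcc 11.2, and the big-science branch of DeepSpeed at 0.4.2+bc17042. A small sanity check that the interpreter matches what the log claims, using standard torch/deepspeed attributes:

    # Confirm the runtime matches the logged environment info.
    import torch
    import deepspeed

    print("torch version ......:", torch.__version__)      # expect 1.8.1
    print("torch cuda version .:", torch.version.cuda)     # expect 11.1
    print("deepspeed info .....:", deepspeed.__version__)  # expect 0.4.2+bc17042
    print("torch install path .:", torch.__path__)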
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
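These two lines come from Megatron's startup banner: it shells out to discover the git revision, and because `git` is not on PATH inside the job environment the `type git` probe fails, so the hash and branch fall back to `unknown`. This is harmless for training; it only affects the provenance recorded in the log. An illustrative reproduction of that probe (the structure and names here are assumptions, not Megatron's actual code):

    # Illustrative only: mimic a shell probe for git like the one that
    # logged "/bin/sh: line 0: type: git: not found" above.
    import subprocess

    have_git = subprocess.run(
        "type git", shell=True,
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
    ).returncode == 0
    git_hash = "unknown"
    if have_git:
        git_hash = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"], text=True
        ).strip()
    print(f"**** Git info: git_hash={git_hash} ****")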
[OKAY] --------------------------------------------------- -ninja .................. [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -op name ................ installed .. compatible --------------------------------------------------- -async_io ............... [NO] ....... [NO] -cpu_adam ............... [YES] ...... [OKAY] - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`.transformer_inference .. -[NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY]async_io -fused_lamb ............. [NO] ....... [OKAY] - ............... [NO] quantizer....... ..............[NO] -[NO] ....... [OKAY] --------------------------------------------------- -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -transformer_inference .. [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... DeepSpeed general environment info: -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch install path ...............torch version .................... 1.8.1 -torch cuda version['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -............... 11.1 -torch versionnvcc version ......................................... 1.8.111.2 - -deepspeed install pathtorch cuda version .......................... 11.1['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -nvcc versiondeepspeed info ........................................ 11.2 -0.4.2+bc17042, bc17042, big-sciencedeepspeed install path - deepspeed wheel compiled w............ ...... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']torch 1.8, cuda 11.1 - -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1DeepSpeed general environment info: - -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -/bin/sh: line 0: type: git: not found -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO]async_io ....... ...............[NO] -[NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference .. [NO] .......utils [OKAY].................. - [YES] ...... [OKAY] -utils ..................quantizer [YES].............. 
......[NO] [OKAY] -....... [OKAY] -quantizer ..............-------------------------------------------------- -[NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.1 -11.1 -nvcc version nvcc version..................... .....................11.2 -11.2deepspeed install path - deepspeed install path........... ...........['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']deepspeed info - deepspeed info................... ...................0.4.2+bc17042, bc17042, big-science -0.4.2+bc17042, bc17042, big-sciencedeepspeed wheel compiled w. - deepspeed wheel compiled w....... ......torch 1.8, cuda 11.1 -torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -DeepSpeed general environment info: -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -nvcc version ..................... 11.2 -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... DeepSpeed general environment info: -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch versiontorch install path .................... ...............1.8.1 -torch cuda version ............... 11.1['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -nvcc version .....................torch version 11.2.................... - deepspeed install path1.8.1 -........... torch cuda version ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']............... - deepspeed info11.1 -...................nvcc version 0.4.2+bc17042, bc17042, big-science..................... - deepspeed wheel compiled w.11.2 -......deepspeed install path torch 1.8, cuda 11.1........... - ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... 
torch 1.8, cuda 11.1 -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.4.2+bc17042, bc17042, big-science0.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] -ninja .................. [OKAY] --------------------------------------------------- --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 
0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -transformer_inference .. [NO] ....... [OKAY] -async_io ............... utils[NO] ......................... [YES][NO] -...... [OKAY] -quantizer .............. [NO] .......transformer_inference [OKAY].. - [NO] .......-------------------------------------------------- -[OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
--------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -ninja .................. [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -fused_adam ............. [NO] ....... [OKAY] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -fused_lamb ............. [NO] ....... [OKAY] -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -stochastic_transformer . [NO] ....... [OKAY] -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.4.2+bc17042, bc17042, big-science0.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -async_ioasync_io .............................. [NO][NO] .............. [NO][NO] - -transformer_inferencetransformer_inference .... [NO][NO] .............. [OKAY][OKAY] - -utilsutils .................................... [YES][YES] ............ [OKAY][OKAY] - -quantizer quantizer.............. ..............[NO] [NO]....... 
.......[OKAY] -[OKAY] --------------------------------------------------- --------------------------------------------------- -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.4.2+bc17042, bc17042, big-science0.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. 
compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -/bin/sh: line 0: type: git: not found -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install path ...............torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... torch version1.8.1 -.................... torch cuda version1.8.1 -............... 11.1torch cuda version - nvcc version............... .....................11.1 - 11.2nvcc version - deepspeed install path..................... ...........11.2 -deepspeed install path['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -...........deepspeed info ...................['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -0.4.2+bc17042, bc17042, big-sciencedeepspeed info - deepspeed wheel compiled w.................... ......0.4.2+bc17042, bc17042, big-science -torch 1.8, cuda 11.1 -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... 
[OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... 
--------------------------------------------------- -JIT compiled ops requires ninja -ninja .................. [OKAY] --------------------------------------------------- -op name ................ installed .. compatible --------------------------------------------------- -cpu_adam ............... [YES] ...... [OKAY] -fused_adam ............. [NO] ....... [OKAY] -fused_lamb ............. [NO] ....... [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -transformer ............ [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] transformer_inference....... ..[NO] -[NO] ....... [OKAY] -utils ..................transformer_inference [YES].. ......[NO] [OKAY]....... - [OKAY] -quantizer .............. [NO] .......utils [OKAY].................. - [YES] ...... --------------------------------------------------[OKAY] - -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -DeepSpeed general environment info:torch version .................... 1.8.1 - -torch cuda version ...............torch install path 11.1............... - nvcc version ..................... 11.2 -deepspeed install path['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -........... torch version['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - ....................deepspeed info 1.8.1................... - 0.4.2+bc17042, bc17042, big-science -torch cuda version deepspeed wheel compiled w................ 
......11.1 -torch 1.8, cuda 11.1nvcc version - ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- -DeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report -DeepSpeed C++/CUDA extension op report ---------------------------------------------------DeepSpeed C++/CUDA extension op report - - -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at ------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - ---------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- --------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - - -----------------------------------------------------------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- --------------------------------------------------- -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja - -JIT compiled ops requires ninja --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - ---------------------------------------------------JIT compiled ops requires ninjaJIT compiled ops requires ninja-------------------------------------------------- - - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - - - ---------------------------------------------------DeepSpeed C++/CUDA extension op report-------------------------------------------------- - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - ---------------------------------------------------JIT compiled ops requires ninja---------------------------------------------------------------------------------------------------- - - - -JIT compiled ops requires ninjaJIT compiled ops requires ninjaJIT compiled ops requires ninja - - ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ----------------------------------------------------------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op reportJIT compiled ops requires ninja - - - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja -ninjaninjaninjaninja .................................... ..................[OKAY] .................. -ninjaninjaninja ninja.................................... .................. [OKAY][OKAY] .................. - -[OKAY][OKAY] -------------------------------------------------- - - [OKAY][OKAY]-------------------------------------------------- - --------------------------------------------------- - -----------------------------------------------------------------------------------------------------op name - -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - -[OKAY] ----------------------------------------------------------------------------------------------------- - -op name -op name-------------------------------------------------- op name - op nameop nameop name................ ................................................installed installedinstalled..installed ....compatible.. 
- compatiblecompatiblecompatible-------------------------------------------------- - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -................................ ................op nameinstalledinstalled ..installed.. ................ compatible..compatible - - installed--------------------------------------------------compatible-------------------------------------------------- - -cpu_adam ............... cpu_adamcpu_adam [YES]cpu_adam ............... ............... ..................... [YES] [YES] [YES] [OKAY]...... ...... - ...... [OKAY] [OKAY] -[OKAY] - -op nameop name op nameop name................................ ................installedinstalled................ installed..installed.. ....compatiblecompatible - -compatiblecompatible-------------------------------------------------- --------------------------------------------------- - - ----------------------------------------------------------------------------------------------------- - - -..-------------------------------------------------- -compatible --------------------------------------------------- -fused_adam .............fused_adam fused_adam [NO] ............. fused_adam............. ....... [NO] [NO].............[OKAY] -..............[NO] [OKAY][OKAY].......fused_lamb - -cpu_adamcpu_adam cpu_adamcpu_adam ............... ............... ..............................[YES] [YES] [YES] [YES]...... ..................[OKAY] -cpu_adamcpu_adam ...............cpu_adam............... [YES]cpu_adam............... [YES] [YES]............ ............... ...... [OKAY][OKAY] -[YES][OKAY] - [OKAY]............. - [OKAY][OKAY][OKAY] - - - -...... [OKAY] - fused_lambfused_lamb[NO] ................................. fused_lamb [NO][NO] [OKAY]........................... - [NO][OKAY][OKAY] - -fused_adamfused_adam fused_adamfused_adam ............. ............. .......................... [NO] [NO] [NO] ....... [NO]....... ....... [OKAY] -....... [OKAY] -[OKAY][OKAY]....... - - fused_lamb[OKAY] fused_lamb -fused_adamfused_adam fused_adam.......................... fused_adam[NO] [NO]............. .................... ....... [NO][OKAY] [NO] -sparse_attn ............sparse_attn sparse_attn [NO] ........................sparse_attn....... ............[NO][NO][OKAY] -[NO].............. .......[OKAY][OKAY] - -transformer[OKAY] -.............fused_lamb [NO]..........................fused_lamb ....... [NO][NO] ............. ....... [NO][OKAY] -....... [OKAY][OKAY]....... - -.......[OKAY] fused_lamb -[OKAY]....... -fused_lamb............. [OKAY]............. -............transformer transformer............transformer[NO] ............[NO]................... [NO].......[NO][OKAY] - [OKAY] -fused_lamb[NO] .......[NO]............. [OKAY][NO] -.......[OKAY]....... - [OKAY][OKAY] - -sparse_attn ............ [NO] ....... [OKAY]sparse_attn -fused_lamb....... .................... [OKAY][OKAY] - -stochastic_transformer stochastic_transformer. stochastic_transformer stochastic_transformer [NO]. ........[NO] . [NO] [OKAY][NO]....... ....... -.......[OKAY] -[OKAY][OKAY] - - sparse_attn............sparse_attn transformer ............[NO]........................ .......[NO][NO] [NO] [OKAY]..................... 
-[NO] .......sparse_attn [OKAY]............ - [NO] .......sparse_attn [OKAY] - [OKAY][OKAY]transformer[OKAY] - - -sparse_attn............ transformer ............ [NO]............ [NO]sparse_attn .......[NO] ....... ............ [OKAY] ....... -[OKAY][NO] -............ transformer[NO] stochastic_transformertransformer................... [OKAY][NO]............. - [OKAY]transformer -....... ............transformer [OKAY][NO]............ stochastic_transformer - ....... [NO] [NO] [OKAY]stochastic_transformer -....... ....... [OKAY][OKAY]. - -....... [NO] [OKAY] transformer -stochastic_transformer [NO] .stochastic_transformer....... [NO][OKAY] . -....... [NO][OKAY] -. ....... ............[NO] stochastic_transformer [OKAY][NO] ....... -.[OKAY]....... - ....... [OKAY] -[NO] stochastic_transformer [OKAY] ....... - .[OKAY] -[NO]stochastic_transformer ....... [OKAY]. - [NO] ....... [OKAY] -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -op nameop name op name................op name ................................installed................ ..installed installedinstalledcompatible -.... ..-------------------------------------------------- compatible - -compatiblecompatible-------------------------------------------------- - - --------------------------------------------------- --------------------------------------------------- -cpu_adam cpu_adam............... cpu_adam...............cpu_adam[YES] [YES] ............... ............... ...... [YES][OKAY]...... -[YES]......[OKAY] - ...... [OKAY][OKAY] - -fused_adam ............. [NO] fused_adam....... .............[OKAY]fused_adam -fused_adam [NO] ............. fused_lamb ....... ............. .............[NO] [OKAY] [NO] -[NO]....... fused_lamb..............[OKAY] .............[OKAY] -[OKAY] -[NO] - fused_lamb....... .............fused_lamb [OKAY] [NO] - .................... sparse_attn [NO] [OKAY] -................... [NO][OKAY] -.......sparse_attn [OKAY]............ - [NO] ....... sparse_attntransformer[OKAY] - ........................transformer sparse_attn [NO] [NO]............................... [NO].......[OKAY] [NO] -.......[OKAY] -[OKAY].......stochastic_transformer -transformer [OKAY]............ -.stochastic_transformer [NO]transformer.[NO] ..........................[NO] [OKAY][OKAY][NO] -....... - [OKAY]....... - stochastic_transformer[OKAY] -. stochastic_transformer[NO] ....... .[OKAY] -[NO] ....... [OKAY] ----------------------------------------------------------------------------------------------------- - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - ----------------------------------------------------------------------------------------------------- - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
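In the table above, [NO] under "installed" with [OKAY] under "compatible" means the op is not prebuilt but will JIT-compile with ninja on first use. A minimal sketch of forcing that build ahead of time, assuming the op_builder API shipped with this DeepSpeed 0.4.x checkout (the builder class name matches the report's op name; the surrounding script is ours):

    # Sketch: pre-build the fused_adam op instead of paying the JIT cost at step 1.
    # Assumes deepspeed.ops.op_builder.FusedAdamBuilder from DeepSpeed 0.4.x.
    from deepspeed.ops.op_builder import FusedAdamBuilder

    builder = FusedAdamBuilder()
    if builder.is_compatible():        # the check behind the "compatible" column
        fused_adam = builder.load()    # JIT-compiles with ninja if not installed
        print("fused_adam loaded:", fused_adam is not None)

Running this once beforehand in the same environment would warm the extension build cache that the ranks later reuse.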
- [WARNING]  async_io requires the libraries: ['libaio-dev'] but they are missing. Can be fixed by: `apt install libaio-dev`.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
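async_io is the one op that stays incompatible here: the libaio development headers are missing on the compute nodes, so its [NO] ....... [NO] row means neither installed nor buildable. A hedged way to reproduce that probe (AsyncIOBuilder is the builder behind the async_io row; treat the snippet as illustrative):

    # Sketch: reproduce the async_io compatibility probe from the report above.
    # is_compatible() should return False until `apt install libaio-dev` is run.
    from deepspeed.ops.op_builder import AsyncIOBuilder

    print("async_io compatible:", AsyncIOBuilder().is_compatible())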
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']
-deepspeed info ................... 0.4.2+bc17042, bc17042, big-science
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
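Every rank prints this same environment summary; DeepSpeed's ds_report CLI emits both this block and the op table above. A minimal Python equivalent for the version lines (the torch and deepspeed attributes are standard; the dotted layout is ours):

    # Sketch: print the same version facts the environment report records.
    import torch
    import deepspeed

    print("torch version ....................", torch.__version__)     # 1.8.1 here
    print("torch cuda version ...............", torch.version.cuda)    # 11.1 here
    print("deepspeed info ...................", deepspeed.__version__) # 0.4.2+bc17042 here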
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
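The `type: git: not found` lines show that git is absent from PATH on the compute nodes, so Megatron's startup banner falls back to unknown for both fields. A hypothetical reconstruction of that probe (the real check lives in Megatron's startup code; the subprocess calls here are illustrative):

    # Hypothetical sketch of the startup git probe; with git missing from PATH,
    # as on these nodes, both fields fall back to "unknown".
    import subprocess

    def _git(*args):
        try:
            return subprocess.check_output(["git", *args], text=True).strip()
        except (OSError, subprocess.CalledProcessError):
            return "unknown"

    git_hash = _git("rev-parse", "--short", "HEAD")
    git_branch = _git("rev-parse", "--abbrev-ref", "HEAD")
    print(f"**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****")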
Op compatibility means that your system - meet the required dependencies to JIT install the op.--------------------------------------------------JIT compiled ops requires ninja - - --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] -async_io-------------------------------------------------- -............... [NO] ....... [NO] -ninjaninjaninjaninja ........................................................................ [OKAY] [OKAY][OKAY] -[OKAY] - - ----------------------------------------------------------------------------------------------------- - -transformer_inference .. [NO] ....... [OKAY] ---------------------------------------------------op name-------------------------------------------------- -op name -................ op name ................op name installed installed ................ .................. ..installed installed compatible....compatible - -utils .................. [YES] ...... [OKAY] -----------------------------------------------------------------------------------------------------compatiblecompatible - - - ----------------------------------------------------------------------------------------------------- - -quantizer .............. [NO] ....... [OKAY] -cpu_adamcpu_adam ..............................cpu_adamcpu_adam [YES][YES].............................. ............[YES][YES] [OKAY][OKAY]............ - --------------------------------------------------- - [OKAY][OKAY] - -fused_adamfused_adam .......................... fused_adam[NO]fused_adam [NO] ........................................ [NO][OKAY][OKAY] -[NO] -....... .......fused_lambfused_lamb[OKAY] -[OKAY]..........................fused_lamb - [NO][NO]fused_lamb............. ........................... [NO] [OKAY] -[OKAY][NO]....... - .......[OKAY] -[OKAY] -sparse_attnsparse_attn ............sparse_attn............sparse_attn [NO][NO] ............ .......................... [NO] [NO][OKAY] [OKAY] - ....... -....... transformer [OKAY] [OKAY]transformer -............ - ............transformer[NO] [NO]transformer................... ....... [NO][OKAY] ............ -[OKAY] -.......[NO] stochastic_transformer [OKAY]stochastic_transformer ....... - .. [OKAY] stochastic_transformer[NO] -[NO] .............. stochastic_transformer [OKAY]. [OKAY] - -.[NO] [NO]....... .......[OKAY] -[OKAY] --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ----------------------------------------------------------------------------------------------------- - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -async_io async_io............... ...............[NO] [NO]....... .......[NO] -[NO] -transformer_inference .. [NO] ....... [OKAY]transformer_inference - .. [NO] ....... [OKAY]utils - .................. [YES] ...... [OKAY] -utils ..................quantizer [YES] .................... [NO] ....... [OKAY][OKAY] - --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- - -op nameop nameop name op name ................................ ................ ................ installed installedinstalled.. installed .. compatible .. -..compatible -------------------------------------------------- - -compatible--------------------------------------------------compatible - - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... [YES]cpu_adam ......cpu_adam cpu_adam...............[OKAY] - ...............[YES]............... [YES]......[YES] ............ [OKAY][OKAY] - - [OKAY]fused_adam - ............. [NO] ....... [OKAY] -fused_adamfused_adam ..........................fused_lamb [NO][NO]fused_adam............. .......[NO] .................... [OKAY] .......[NO] -[OKAY] [OKAY] - -fused_lamb....... [OKAY]............. -fused_lamb [NO]............. fused_lamb ....... 
[NO] ............. [OKAY] .......[NO] -sparse_attn [OKAY]............ - .......[NO] [OKAY]....... - [OKAY] -sparse_attn transformer............ ............ sparse_attn [NO] [NO] ............ ....... ....... [NO][OKAY][OKAY]sparse_attn - - ....... transformerstochastic_transformer............ [OKAY]............ [NO] - .[NO].......transformer [NO] [OKAY]....... ............ -....... [NO][OKAY][OKAY] - - transformer....... stochastic_transformer ............ [OKAY] -[NO]. [NO] ....... .......stochastic_transformer[OKAY] -[OKAY] -. [NO] ....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.4.2+bc17042, bc17042, big-science0.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path ............... ............... 
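Per the NOTE in the report, an op listed as installed=[NO] but compatible=[OKAY] (e.g. fused_adam above) is built just-in-time with ninja the first time it is requested. A minimal sketch of that path, assuming the deepspeed.ops.op_builder API of this era:

    # JIT path sketch: is_compatible() is what feeds the "compatible" column,
    # and load() triggers the ninja build when the op was not pre-installed.
    # Builder names follow deepspeed.ops.op_builder in DeepSpeed 0.4.x.
    from deepspeed.ops.op_builder import FusedAdamBuilder

    builder = FusedAdamBuilder()
    if builder.is_compatible():
        fused_adam_module = builder.load()  # compiles the extension if needed
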
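The `/bin/sh: line 0: type: git: not found` lines show Megatron probing for a git binary before reading the repository hash and branch; the compute nodes have no git, so both fall back to "unknown". A hypothetical reconstruction of that probe (the helper below is illustrative, not Megatron's actual function; only the "type git" shell check is taken from the log itself):

    # Illustrative reconstruction of the probe behind the
    # "**** Git info for Megatron: ... ****" lines.
    import subprocess

    def _run(cmd):
        try:
            # stderr is left attached, so a missing git prints the shell's
            # "type: git: not found" message, exactly as in the log
            return subprocess.check_output(cmd, shell=True).decode().strip()
        except subprocess.CalledProcessError:
            return None

    def git_info():
        if _run("type git") is None:
            return "unknown", "unknown"
        return (_run("git rev-parse --short HEAD") or "unknown",
                _run("git rev-parse --abbrev-ref HEAD") or "unknown")

    git_hash, git_branch = git_info()
    print(f"**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****")
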
torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -DeepSpeed C++/CUDA extension op report -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. ------------------------------------------------------------------------------------------------------------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - --------------------------------------------------- --------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninjaDeepSpeed C++/CUDA extension op report - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninja - -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ----------------------------------------------------------------------------------------------------- --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - - -----------------------------------------------------------------------------------------------------DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - -DeepSpeed C++/CUDA extension op report - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja - ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninjaninjaninjaninja ...................................................... .................. [OKAY][OKAY] -[OKAY][OKAY] - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- - --------------------------------------------------- -op nameop name - op nameop name ................................ ................................ installed installed installedinstalled .. .. ..compatible.. -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY][OKAY] - - - - --------------------------------------------------compatiblecompatiblecompatible - - - --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - ----------------------------------------------------------------------------------------------------- --------------------------------------------------- - -op nameop name op nameop name ................................ ................installed................installed installed.. installed.... compatiblecompatible -..compatible --------------------------------------------------- - --------------------------------------------------- --------------------------------------------------compatible - - --------------------------------------------------- -cpu_adam ............... [YES]cpu_adam cpu_adam...... cpu_adam[OKAY] ............... -cpu_adam ............... [YES]cpu_adam cpu_adamcpu_adam...... ...............[OKAY]............... ............... -............... ............... [YES] [YES] [YES] ...... ............ [OKAY]fused_adam[OKAY][OKAY] - - - [YES][YES][YES] ............ ......[OKAY] -[OKAY][OKAY]fused_adam - -............. [NO] ....... [OKAY] - ............. [NO] ....... [OKAY]fused_adam -fused_adamfused_lamb fused_adam............. fused_adam ............. .............[NO].............[NO] .......[NO][NO]....... [OKAY] -.......[OKAY] -.......[OKAY]fused_lamb - [OKAY]............. - fused_adam.............fused_adam ..........................fused_lamb[NO] [NO].......[NO]............. .......[OKAY]....... -[NO] [OKAY][OKAY]fused_lamb....... - - fused_lamb[NO] ....................fused_lamb sparse_attn [OKAY][NO]............. - .............fused_lamb [OKAY]fused_lamb[NO] -............ .......[NO][NO] [OKAY].............. - .......................... ....... [NO] [NO] [OKAY] ....... - [OKAY][OKAY] - -sparse_attntransformer ........................ [NO][NO] .............. [OKAY]sparse_attn[OKAY] - - .......[OKAY] -[OKAY]sparse_attn - ............ [NO] ....... [OKAY] - transformersparse_attn............ stochastic_transformer............ ............ [NO].[NO] [NO] [NO] ............................ [OKAY][OKAY][OKAY][OKAY] - -sparse_attn transformersparse_attn ............ ............ sparse_attn............ [NO][NO] ............ [NO]....... ....... [NO][OKAY].......[OKAY] - - - -[OKAY]....... 
transformer -[OKAY]stochastic_transformer -transformerstochastic_transformer transformer............ . ............ [NO] [NO] [NO] .............. .......[OKAY][OKAY] - -[OKAY] - ............transformer [NO].transformer ............ [NO] ....... ............ ....... [NO] [OKAY] [OKAY][NO] -....... -stochastic_transformer .stochastic_transformer [NO] ........ [OKAY][NO] - ....... [OKAY] - stochastic_transformer.......[OKAY] -[OKAY]. - stochastic_transformer[NO] stochastic_transformer....... . [OKAY] [NO]. - .......[NO] [OKAY]....... - [OKAY] -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.4.2+bc17042, bc17042, big-science0.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -DeepSpeed general environment info: -torch install pathDeepSpeed general environment info: ............... -torch install path['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -............... torch version .................... 1.8.1 -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch cuda version ...............torch version 11.1.................... - 1.8.1nvcc version - ..................... torch cuda version11.2 -...............deepspeed install path 11.1........... - nvcc version ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']..................... - 11.2deepspeed info - deepspeed install path................... ...........0.4.2+bc17042, bc17042, big-science -['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']deepspeed wheel compiled w. - ......deepspeed info torch 1.8, cuda 11.1................... - 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- -DeepSpeed C++/CUDA extension op report ------------------------------------------------------------------------------------------------------------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.DeepSpeed C++/CUDA extension op report - - -DeepSpeed C++/CUDA extension op report------------------------------------------------------------------------------------------------------------------------------------------------------ - - - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.JIT compiled ops requires ninja - - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.---------------------------------------------------------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - -JIT compiled ops requires ninja --------------------------------------------------- -JIT compiled ops requires ninja -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -ninjaninjaninja ninja...................................................... .................. [OKAY][OKAY][OKAY] -[OKAY] - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - -op name op name................op nameop name ................................................installed installed..installedinstalled ..compatible.... - compatible--------------------------------------------------compatible - - -compatible---------------------------------------------------------------------------------------------------- - - --------------------------------------------------- -cpu_adam ............... [YES] cpu_adam...... cpu_adam...............[OKAY] cpu_adam -[YES]............... .....................[YES] [OKAY]......[YES] - [OKAY]fused_adam...... - ............. [OKAY][NO]fused_adam - .................... fused_adam [OKAY] [NO] -............. .......[NO] [OKAY]fused_lamb....... -fused_adam .............fused_lamb [OKAY] .......................... -[NO] [NO]fused_lamb.......[NO] ..............[OKAY]............. - [OKAY][NO] - [OKAY]....... -[OKAY] -fused_lamb .............sparse_attn [NO]............ sparse_attn[NO]....... ...................sparse_attn [OKAY][NO]............[OKAY] - -[NO]....... transformer.......[OKAY] -............[OKAY] -[NO]transformer transformer...................sparse_attn ............[OKAY][NO] -............[NO] ....... [NO].......stochastic_transformer[OKAY] -[OKAY]....... -. stochastic_transformer [OKAY]stochastic_transformer [NO] - ......... [NO][NO][OKAY] transformer....... -............ ....... [OKAY] [NO] -[OKAY] -....... [OKAY] -stochastic_transformer . [NO] ....... [OKAY] -DeepSpeed general environment info: -torch install path ............... 
['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report---------------------------------------------------------------------------------------------------- - - - ---------------------------------------------------DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report - -DeepSpeed C++/CUDA extension op report-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- ---------------------------------------------------NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- - - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- -JIT compiled ops requires ninja - --------------------------------------------------- -JIT compiled ops requires ninja-------------------------------------------------- - - -JIT compiled ops requires ninjaJIT compiled ops requires ninja - -ninjaninjaninjaninja ...................................................... .................. [OKAY] [OKAY][OKAY] - -[OKAY] - ------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------------------------------------- -op name -op nameop name ................op name................................ installedinstalled................installed ......installed compatible compatiblecompatible.. - - - ------------------------------------------------------------------------------------------------------------------------------------------------------compatible - - - --------------------------------------------------- -cpu_adam cpu_adam...............cpu_adam cpu_adam[YES] ............... .....................[YES]............... [OKAY][YES]......[YES] - [OKAY]............ - [OKAY] -[OKAY] -fused_adam ............. [NO]fused_adam .................... [OKAY]fused_adamfused_adam[NO] - .................... .............fused_lamb[OKAY][NO] - [NO] ............. ....... fused_lamb....... [NO] [OKAY] .............[OKAY]....... - - [NO][OKAY] -.......fused_lamb fused_lamb [OKAY] ............. -............. [NO][NO] .............. [OKAY]sparse_attn[OKAY] - -............ [NO] .......sparse_attn [OKAY]............ - [NO] transformer....... ............[OKAY] sparse_attnsparse_attn - [NO] ............................... 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
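The op report above is DeepSpeed's environment report (the same text the `ds_report` command prints). The `ninja .................. [OKAY]` line matters because every op marked `[NO]` is JIT-compiled on first use, and that build needs the ninja tool. A hedged sketch of such a presence check; DeepSpeed's real check differs, but the point is the same:

```python
# Hedged sketch of a ninja presence check: JIT-building the ops marked
# [NO] above requires the ninja build tool to be on PATH.
import shutil

status = "[OKAY]" if shutil.which("ninja") else "[FAIL]"
print("ninja ..................", status)
```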
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
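The `type: git: not found` lines mean the compute nodes have no `git` on PATH, so the Megatron git probe degrades to `unknown` instead of failing the run. A hedged illustration of that failure mode (not Megatron's actual code):

```python
# Illustrative only: a git probe that falls back to "unknown" when git
# is missing from PATH, producing lines like the ones above.
import subprocess

def _git(*args):
    try:
        out = subprocess.check_output(("git",) + args, stderr=subprocess.DEVNULL)
        return out.decode().strip()
    except (OSError, subprocess.CalledProcessError):
        return "unknown"

print("**** Git info for Megatron: git_hash={} git_branch={} ****".format(
    _git("rev-parse", "--short", "HEAD"),
    _git("rev-parse", "--abbrev-ref", "HEAD")))
```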
- [WARNING] async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`.
-async_io ............... [NO] ....... [NO]
-transformer_inference .. [NO] ....... [OKAY]
-utils .................. [YES] ...... [OKAY]
-quantizer .............. [NO] ....... [OKAY]
---------------------------------------------------
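The async_io warning is benign here: the op needs libaio, which is not installed on the nodes, and it is typically only needed for NVMe/disk offload. A hedged sketch of checking for the library before attempting the suggested `apt install libaio-dev` fix (illustrative, not DeepSpeed's own check):

```python
# Hedged sketch: check whether libaio is visible to the dynamic loader,
# which is what keeps async_io at [NO] in the report above.
import ctypes.util

found = ctypes.util.find_library("aio")
if found is None:
    print("libaio missing; async_io stays [NO] "
          "(fix on Debian/Ubuntu: apt install libaio-dev)")
else:
    print("libaio present ({}); async_io can be JIT-built".format(found))
```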
-[... the same environment info, op report, git probe, and async_io report are printed by every remaining rank; the heavily interleaved duplicates are elided ...]
[OKAY][OKAY] - --------------------------------------------------- -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`.utils - .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY]async_io - ............... --------------------------------------------------[NO] - ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path -............... torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version ....................['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -1.8.1 -torch version torch cuda version.................... ...............1.8.1 -11.1 -torch cuda versionnvcc version .................................... 11.111.2 - -nvcc versiondeepspeed install path ................................ 11.2 -['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']deepspeed install path - deepspeed info........... ................... 0.4.2+bc17042, bc17042, big-science['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed wheel compiled w.deepspeed info ......................... torch 1.8, cuda 11.10.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w. ...... 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meet the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
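The op report above is worth a gloss: ops marked "installed .. [NO]" are not prebuilt; per the NOTE, DeepSpeed compiles them just-in-time with ninja the first time they are used. A minimal sketch of what triggers such a build, assuming the deepspeed 0.4.2 install reported above and a CUDA-capable GPU (the toy parameter list is illustrative, not from the log):

import torch
from deepspeed.ops.adam import FusedAdam  # first use JIT-builds the fused_adam op via ninja

# A throwaway parameter, just to give the optimizer something to hold.
params = [torch.nn.Parameter(torch.zeros(4, device="cuda"))]

# Constructing FusedAdam loads (and, if not prebuilt, ninja-compiles) the
# fused_adam extension; "fused_adam ............. [NO] ....... [OKAY]" above
# means "not preinstalled, but compatible for JIT install".
opt = FusedAdam(params, lr=6e-05)  # lr matches the arguments dump below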
-using world size: 256, data-parallel-size: 8, tensor-model-parallel size: 4, pipeline-model-parallel size: 8
-using torch.float16 for parameters ...
-------------------------- arguments ------------------------
-  accumulate_allreduce_grads_in_fp32 .............. False
-  adam_beta1 ...................................... 0.9
-  adam_beta2 ...................................... 0.999
-  adam_eps ........................................ 1e-08
-  adlr_autoresume ................................. False
-  adlr_autoresume_interval ........................ 1000
-  apply_query_key_layer_scaling ................... True
-  apply_residual_connection_post_layernorm ........ False
-  attention_dropout ............................... 0.1
-  attention_softmax_in_fp32 ....................... False
-  bert_binary_head ................................ True
-  bert_load ....................................... None
-  bf16 ............................................ False
-  bias_dropout_fusion ............................. True
-  bias_gelu_fusion ................................ True
-  biencoder_projection_dim ........................ 0
-  biencoder_shared_query_context_model ............ False
-  block_data_path ................................. None
-  checkpoint_activations .......................... True
-  checkpoint_in_cpu ............................... False
-  checkpoint_num_layers ........................... 1
-  clip_grad ....................................... 1.0
-  codecarbon_dir .................................. /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/tr8-104B-data/codecarbon
-  consumed_train_samples .......................... 0
-  consumed_valid_samples .......................... 0
-  contigious_checkpointing ........................ False
-  cpu_optimizer ................................... False
-  cpu_torch_adam .................................. False
-  data_impl ....................................... mmap
-  data_parallel_size .............................. 8
-  data_path ....................................... ['/gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document']
-  dataloader_type ................................. single
-  DDP_impl ........................................ local
-  decoder_seq_length .............................. None
-  deepscale ....................................... False
-  deepscale_config ................................ None
-  deepspeed ....................................... True
-  deepspeed_activation_checkpointing .............. True
-  deepspeed_config ................................ ./ds_config.1161730.json
-  deepspeed_mpi ................................... False
-  distribute_checkpointed_activations ............. False
-  distributed_backend ............................. nccl
-  embedding_path .................................. None
-  encoder_seq_length .............................. 2048
-  eod_mask_loss ................................... False
-  eval_interval ................................... 1000
-  eval_iters ...................................... 5
-  evidence_data_path .............................. None
-  exit_duration_in_mins ........................... 110
-  exit_interval ................................... None
-  ffn_hidden_size ................................. 20480
-  finetune ........................................ False
-  fp16 ............................................ True
-  fp16_lm_cross_entropy ........................... False
-  fp32_residual_connection ........................ False
-  global_batch_size ............................... 2048
-  hidden_dropout .................................. 0.1
-  hidden_size ..................................... 16384
-  hysteresis ...................................... 2
-  ict_head_size ................................... None
-  ict_load ........................................ None
-  img_dim ......................................... 224
-  indexer_batch_size .............................. 128
-  indexer_log_interval ............................ 1000
-  init_method_std ................................. 0.02
-  init_method_xavier_uniform ...................... False
-  initial_loss_scale .............................. 4294967296
-  kv_channels ..................................... 512
-  layernorm_epsilon ............................... 1e-05
-  lazy_mpu_init ................................... None
-  load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints
-  local_rank ...................................... 0
-  log_batch_size_to_tensorboard ................... True
-  log_interval .................................... 1
-  log_learning_rate_to_tensorboard ................ True
-  log_loss_scale_to_tensorboard ................... True
-  log_num_zeros_in_grad ........................... False
-  log_params_norm ................................. False
-  log_timers_to_tensorboard ....................... True
-  log_validation_ppl_to_tensorboard ............... True
-  loss_scale ...................................... 12.0
-  loss_scale_window ............................... 1000
-  lr .............................................. 6e-05
-  lr_decay_iters .................................. None
-  lr_decay_samples ................................ 126953125
-  lr_decay_style .................................. cosine
-  lr_warmup_fraction .............................. None
-  lr_warmup_iters ................................. 0
-  lr_warmup_samples ............................... 216320
-  make_vocab_size_divisible_by .................... 128
-  mask_prob ....................................... 0.15
-  masked_softmax_fusion ........................... True
-  max_position_embeddings ......................... 2048
-  memory_centric_tiled_linear ..................... False
-  merge_file ...................................... /gpfswork/rech/six/commun/code/tr8-104B/Megatron-DeepSpeed-tr8-104B/data/gpt2-merges.txt
-  micro_batch_size ................................ 1
-  min_loss_scale .................................. 1.0
-  min_lr .......................................... 6e-06
-  mmap_warmup ..................................... False
-  no_load_optim ................................... None
-  no_load_rng ..................................... None
-  no_save_optim ................................... None
-  no_save_rng ..................................... None
-  num_attention_heads ............................. 32
-  num_channels .................................... 3
-  num_classes ..................................... 1000
-  num_layers ...................................... 32
-  num_layers_per_virtual_pipeline_stage ........... None
-  num_workers ..................................... 2
-  onnx_safe ....................................... None
-  openai_gelu ..................................... False
-  optimizer ....................................... adam
-  override_lr_scheduler ........................... False
-  params_dtype .................................... torch.float16
-  partition_activations ........................... False
-  patch_dim ....................................... 16
-  pipeline_model_parallel_size .................... 8
-  position_embedding_type ......................... PositionEmbeddingType.absolute
-  profile_backward ................................ False
-  query_in_block_prob ............................. 0.1
-  rampup_batch_size ............................... ['16', '16', '6_000_000']
-  rank ............................................ 0
-  remote_device ................................... none
-  reset_attention_mask ............................ False
-  reset_position_ids .............................. False
-  retriever_report_topk_accuracies ................ []
-  retriever_score_scaling ......................... False
-  retriever_seq_length ............................ 256
-  sample_rate ..................................... 1.0
-  save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints
-  save_interval ................................... 1500
-  scatter_gather_tensors_in_pipeline .............. True
-  scattered_embeddings ............................ False
-  seed ............................................ 42
-  seq_length ...................................... 2048
-  sgd_momentum .................................... 0.9
-  short_seq_prob .................................. 0.1
-  split ........................................... 949,50,1
-  split_transformers .............................. False
-  synchronize_each_layer .......................... False
-  tensor_model_parallel_size ...................... 4
-  tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/tr8-104B-data/tensorboard
-  tensorboard_log_interval ........................ 1
-  tensorboard_queue_size .......................... 5
-  tile_factor ..................................... 1
-  titles_data_path ................................ None
-  tokenizer_name_or_path .......................... None
-  tokenizer_type .................................. GPT2BPETokenizer
-  train_iters ..................................... None
-  train_samples ................................... 300000000
-  use_checkpoint_lr_scheduler ..................... False
-  use_contiguous_buffers_in_ddp ................... False
-  use_cpu_initialization .......................... None
-  use_one_sent_docs ............................... False
-  use_pin_memory .................................. False
-  virtual_pipeline_model_parallel_size ............ None
-  vocab_extra_ids ................................. 0
-  vocab_file ...................................... /gpfswork/rech/six/commun/code/tr8-104B/Megatron-DeepSpeed-tr8-104B/data/gpt2-vocab.json
-  weight_decay .................................... 0.1
-  world_size ...................................... 256
-  zero_allgather_bucket_size ...................... 0.0
-  zero_contigious_gradients ....................... False
-  zero_reduce_bucket_size ......................... 0.0
-  zero_reduce_scatter ............................. False
-  zero_stage ...................................... 1
---------------------- end of arguments ---------------------
-will use batch size rampup starting from global batch size 16 to global batch size 2048 with batch size increments 16 over 6000000 samples.
-> building GPT2BPETokenizer tokenizer ...
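Two of the logged values can be sanity-checked by hand. The parallel layout multiplies out: 8 data-parallel x 4 tensor-parallel x 8 pipeline-parallel = 256 ranks, matching world_size. And the rampup line can be turned into a concrete schedule; a rough sketch, assuming Megatron's linear rampup semantics in which the 6,000,000 rampup samples are split evenly across the (2048 - 16) / 16 = 127 batch-size increments:

start, increment, rampup_samples = 16, 16, 6_000_000  # rampup_batch_size from the log
global_batch_size = 2048

num_increments = (global_batch_size - start) // increment  # 127
samples_per_increment = rampup_samples / num_increments    # ~47,244 samples per step

def batch_size_at(consumed_samples: int) -> int:
    """Global batch size in effect after `consumed_samples` training samples."""
    if consumed_samples >= rampup_samples:
        return global_batch_size
    steps = int(consumed_samples / samples_per_increment)
    return start + steps * increment

assert batch_size_at(0) == 16
assert batch_size_at(3_000_000) == 1024  # roughly halfway through the rampup
assert batch_size_at(6_000_000) == 2048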
....................................[OKAY] [OKAY] -[OKAY] -[OKAY]-------------------------------------------------- --------------------------------------------------- - - -----------------------------------------------------------------------------------------------------op nameop name - - op name................................ op nameinstalled ................ installed..installed ....................compatible -compatiblecompatible -installed --------------------------------------------------- ---------------------------------------------------------------------------------------------------- -.. - compatible - --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -cpu_adam ............... cpu_adam[YES] .....................cpu_adam [YES][OKAY]............... -...... cpu_adam[YES][OKAY] -async_io ............... [NO] ....... [NO] -fused_adam...... ............................[OKAY] -[NO] .......[YES] [OKAY]fused_adam -...... .............[OKAY] fused_lamb -transformer_inference .. [NO] ....... [OKAY] -[NO] fused_adam .......................... .......[NO] [NO] [OKAY] .............. -fused_lambfused_adam [OKAY] -[OKAY]............. -utils .................. [YES] ...... [OKAY] -............. [NO] fused_lamb.......[NO] ....................[OKAY] - sparse_attn[NO][OKAY] -quantizer .............. [NO] ....... [OKAY] - ............ .......fused_lamb[NO] [OKAY].................... --------------------------------------------------- - [OKAY][NO] -sparse_attn ............ [NO]transformer .......................... [NO][OKAY] [OKAY] -....... -sparse_attn [OKAY] transformer - ........................ stochastic_transformer [NO][NO]. .............. [NO] [OKAY]sparse_attn -[OKAY] - transformer.......stochastic_transformer............ [OKAY]............[NO] - . [NO] ....... [OKAY].......[NO] [OKAY]....... - - [OKAY]transformer stochastic_transformer............ -. [NO][NO] ....... .......[OKAY] -[OKAY] -stochastic_transformer . [NO] ....... [OKAY] - > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688) - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO]async_io - ............... [NO] ....... [NO] -transformer_inference .. [NO] transformer_inference....... ..[OKAY] -[NO] ....... [OKAY]utils - .................. [YES] ...... [OKAY]utils - .................. [YES] quantizer...... ..............[OKAY] -[NO] ....... [OKAY] -quantizer .............. [NO]-------------------------------------------------- -....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. --------------------------------------------------- -JIT compiled ops requires ninja --------------------------------------------------- -DeepSpeed C++/CUDA extension op report --------------------------------------------------- -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
-DeepSpeed general environment info:
-torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']
-torch version .................... 1.8.1
-torch cuda version ............... 11.1
-nvcc version ..................... 11.2
-deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']
-deepspeed info ................... 0.4.2+bc17042, bc17042, big-science
-deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1
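This block pins the software stack for the run: torch 1.8.1 built for CUDA 11.1 (with nvcc 11.2 on the system) and DeepSpeed 0.4.2 at commit bc17042 from the big-science branch. The same fields can be read back directly from Python, e.g.:

    # Sketch: reproduce the key fields of the environment block.
    import torch
    import deepspeed

    print("torch install path ...", torch.__path__)        # site-packages path
    print("torch version ........", torch.__version__)     # 1.8.1 here
    print("torch cuda version ...", torch.version.cuda)    # 11.1 here
    print("deepspeed info .......", deepspeed.__version__) # 0.4.2+bc17042 here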
-/bin/sh: line 0: type: git: not found
-**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
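The `type: git` error is harmless: the compute nodes have no git binary on PATH, so Megatron's banner falls back to git_hash=unknown / git_branch=unknown. A sketch of that fallback logic (Megatron shells out similarly; the helper below is illustrative, not its actual code):

    # Sketch: why the banner prints git_hash=unknown on these nodes.
    import shutil
    import subprocess

    def git_hash():
        if shutil.which("git") is None:   # the case in this log
            return "unknown"
        return subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"], text=True).strip()

    print(f"**** Git info for Megatron: git_hash={git_hash()} ****")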
-> setting codecarbon ...
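`> setting codecarbon ...` is the training script enabling energy/carbon tracking. The Megatron-side wiring is not shown in this excerpt; a stand-alone sketch with the codecarbon package (EmissionsTracker is codecarbon's public class; the measured workload is a hypothetical stand-in):

    # Sketch: stand-alone codecarbon usage, not Megatron's exact integration.
    from codecarbon import EmissionsTracker

    def train_step():
        sum(i * i for i in range(10**6))   # hypothetical workload

    tracker = EmissionsTracker(output_dir="logs")  # writes emissions.csv
    tracker.start()
    try:
        train_step()
    finally:
        print(f"emissions: {tracker.stop()} kg CO2eq")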
[OKAY][OKAY] -[OKAY] -----------------------------------------------------------------------------------------------------[OKAY] - - - -op nameop name---------------------------------------------------------------------------------------------------- - -................................op name op nameinstalled installed .................................... installedinstalledcompatiblecompatible -.. -..-------------------------------------------------- -------------------------------------------------- - compatible -compatible - ----------------------------------------------------------------------------------------------------- - -cpu_adam ............... cpu_adam[YES] .....................cpu_adam cpu_adam [YES][OKAY]............... - ......[YES]............... [OKAY]......[YES] - [OKAY]...... -fused_adam [OKAY]............. -fused_adam[NO] .................... fused_adam[NO][OKAY] -.................... fused_adamfused_lamb [OKAY] [NO]............. - .......[NO]............. fused_lamb ....... [OKAY] .............[NO][OKAY] - - [NO] .......fused_lamb....... [OKAY][OKAY]............. - - [NO] ....... sparse_attn[OKAY]fused_lamb - ......................... [NO] [NO]sparse_attn....... ............[OKAY]....... -sparse_attn [NO] transformer[OKAY] ........................ -....... [NO][NO][OKAY] -....... .......[OKAY]transformer - [OKAY]............sparse_attn stochastic_transformer - [NO] ............ . .......transformer [NO][NO]............ [OKAY] -.......[NO]....... [OKAY]stochastic_transformer[OKAY] - ....... - .[OKAY] transformer -[NO] ...................stochastic_transformer [OKAY] [NO] - ........ [NO][OKAY] -....... [OKAY]stochastic_transformer - . [NO] ....... [OKAY] -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -async_ioasync_io .............................. [NO][NO] .............. [NO][NO] - -transformer_inference transformer_inference.. ..[NO] [NO]....... .......[OKAY] -[OKAY] -utils .................. [YES]utils ........................ [YES][OKAY] -...... [OKAY] -quantizer .............. [NO]quantizer ..................... [OKAY][NO] - ....... [OKAY] --------------------------------------------------- --------------------------------------------------- -/bin/sh: line 0: type: git: not found - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -async_ioasync_io ............... ...............[NO] [NO]....... .......[NO] -[NO] -transformer_inference .. [NO] transformer_inference....... ..[OKAY] -[NO] ....... [OKAY] -utils .................. [YES]utils ........................ [OKAY][YES] - ...... [OKAY] -quantizer .............. [NO]quantizer ..................... [OKAY][NO] - ....... 
[OKAY]-------------------------------------------------- - --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.4.2+bc17042, bc17042, big-science0.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path - ............... torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']torch version - .................... torch version1.8.1 -.................... torch cuda version 1.8.1............... - 11.1torch cuda version - nvcc version............... .....................11.1 -11.2nvcc version - deepspeed install path..................... 
...........11.2 -deepspeed install path['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] ........... - deepspeed info ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']................... - deepspeed info0.4.2+bc17042, bc17042, big-science -...................deepspeed wheel compiled w. 0.4.2+bc17042, bc17042, big-science...... - deepspeed wheel compiled w.torch 1.8, cuda 11.1 -...... torch 1.8, cuda 11.1 -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 --------------------------------------------------- ---------------------------------------------------DeepSpeed C++/CUDA extension op report - --------------------------------------------------- -DeepSpeed C++/CUDA extension op reportNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - ------------------------------------------------------------------------------------------------------------------------------------------------------- - ---------------------------------------------------JIT compiled ops requires ninjaNOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - - - -DeepSpeed C++/CUDA extension op reportDeepSpeed C++/CUDA extension op report-------------------------------------------------- - - ---------------------------------------------------JIT compiled ops requires ninja-------------------------------------------------- - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. 
Op compatibility means that your system - meet the required dependencies to JIT install the op.-------------------------------------------------- - -JIT compiled ops requires ninja-------------------------------------------------- - -JIT compiled ops requires ninja -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -utils .................. [YES] ...... [OKAY] -quantizer .............. [NO] ....... [OKAY] --------------------------------------------------- -ninjaninjaninjaninja ........................................................................ [OKAY][OKAY][OKAY] -[OKAY] --------------------------------------------------- - --------------------------------------------------- ----------------------------------------------------------------------------------------------------- -op name - -op name op name................ ................op name ................ installedinstalled................ installed .. installed ....compatible -compatible..compatible-------------------------------------------------- - --------------------------------------------------- -compatible --------------------------------------------------- - --------------------------------------------------- -cpu_adamcpu_adam ...............cpu_adam............... cpu_adam[YES] ............... [YES]...............[YES]...... ...... [OKAY][YES] ...... - [OKAY]......[OKAY] - - [OKAY] -fused_adam ............. [NO] fused_adamfused_adam.......fused_adam ............. ..........................[OKAY] [NO] - [NO][NO]....... fused_lamb..............[OKAY] - .............[OKAY] -[OKAY]fused_lamb[NO] - .............fused_lamb....... [NO].............[OKAY]fused_lamb - .......[NO]............. .......[NO][OKAY] [OKAY] -....... - [OKAY] -sparse_attn ............ [NO] ....... [OKAY] -sparse_attnsparse_attn transformer sparse_attn........................ [NO] ........................ [NO] ....... [NO][NO] ....... [OKAY] ....... - .......[OKAY][OKAY] - -transformer[OKAY] -............transformer stochastic_transformertransformer [NO] ............ ....................[NO] [NO] [NO][OKAY].............. - .......[OKAY][OKAY] - -stochastic_transformer[OKAY] -stochastic_transformer. stochastic_transformer[NO]. .......[NO] . [OKAY] ....... -[NO] [OKAY]....... - [OKAY] - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] .......transformer_inference [OKAY].. - [NO] ....... utils[OKAY] -.................. [YES] ...... [OKAY]utils - .................. quantizer[YES] .................... [NO][OKAY] -....... [OKAY] -quantizer ..............-------------------------------------------------- -[NO] ....... 
[OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -async_io async_io............... ...............[NO] [NO]....... .......[NO] -[NO] -transformer_inferencetransformer_inference .... [NO][NO] .............. [OKAY][OKAY] - -utils ..................utils [YES].................. ......[YES] [OKAY]...... - [OKAY] -quantizer quantizer.............. ..............[NO] [NO]....... .......[OKAY] -[OKAY] --------------------------------------------------- --------------------------------------------------- - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -async_ioasync_io .............................. [NO][NO] .............. [NO][NO] - -transformer_inference .. [NO]transformer_inference ......... [OKAY][NO] - ....... [OKAY] -utils ..................utils [YES].................. ......[YES] [OKAY]...... - [OKAY] -quantizer .............. quantizer[NO] ..................... [NO][OKAY] -....... [OKAY] --------------------------------------------------- --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -torch install path ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version .................... 1.8.1 -torch cuda version ............... 11.1DeepSpeed general environment info: -nvcc version ..................... - 11.2 -deepspeed install path ...........torch install path ...............['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']deepspeed wheel compiled w. - ...... torch versiontorch 1.8, cuda 11.1 -.................... 1.8.1 -torch cuda version ............... 11.1 -nvcc version ..................... 11.2 -deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info ................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path - ............... torch install path ............... 
['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version ....................['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -1.8.1 -torch versiontorch cuda version ................................... 1.8.111.1 - -nvcc versiontorch cuda version .................................... 11.211.1 - -deepspeed install path nvcc version........... ..................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']11.2 - -deepspeed infodeepspeed install path .............................. 0.4.2+bc17042, bc17042, big-science['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed wheel compiled w.deepspeed info ......................... torch 1.8, cuda 11.10.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown ******** Git info for Megatron: git_hash=unknown git_branch=unknown **** - -/bin/sh: line 0: type: git: not found -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - -async_io ...............async_io [NO] ...................... [NO][NO] -....... [NO] -transformer_inference .. [NO]transformer_inference ....... ..[OKAY] -[NO] ....... [OKAY] -utils .................. [YES] utils...... ..................[OKAY] -[YES] ...... [OKAY] -quantizer .............. quantizer[NO] ..................... [NO][OKAY] -....... [OKAY] --------------------------------------------------- --------------------------------------------------- -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install pathtorch install path .............................. ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] - -torch versiontorch version ........................................ 1.8.11.8.1 - -torch cuda versiontorch cuda version .............................. 11.111.1 - -nvcc versionnvcc version .......................................... 11.211.2 - -deepspeed install pathdeepspeed install path ...................... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed infodeepspeed info ...................................... 0.4.2+bc17042, bc17042, big-science0.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w.deepspeed wheel compiled w. ............ torch 1.8, cuda 11.1torch 1.8, cuda 11.1 - -DeepSpeed general environment info: -torch install pathDeepSpeed general environment info: ............... -torch install path ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']............... - torch version .................... 1.8.1 -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch cuda version ...............torch version 11.1.................... - nvcc version1.8.1 -..................... 
11.2torch cuda version - deepspeed install path............... ...........11.1 -nvcc version['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] ..................... - deepspeed info11.2 -...................deepspeed install path 0.4.2+bc17042, bc17042, big-science........... - deepspeed wheel compiled w. ['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed']...... - deepspeed infotorch 1.8, cuda 11.1 -................... 0.4.2+bc17042, bc17042, big-science -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. - [WARNING]  async_io requires the libraries: ['libaio-dev'] but are missing. Can be fixed by: `apt install libaio-dev`. -async_io ............... [NO] ....... [NO] -async_io ............... [NO] ....... [NO] -transformer_inference .. [NO] ....... [OKAY] -transformer_inference utils.. ..................[NO] [YES]....... ......[OKAY] -[OKAY] -quantizerutils ................................ [NO][YES] ............. [OKAY][OKAY] - ---------------------------------------------------quantizer - .............. [NO] ....... [OKAY] --------------------------------------------------- -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -/bin/sh: line 0: type: git: not found -**** Git info for Megatron: git_hash=unknown git_branch=unknown **** -DeepSpeed general environment info: -DeepSpeed general environment info:torch install path -............... torch install path ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch']............... - torch version ....................['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] 1.8.1 - -torch versiontorch cuda version ................................... 1.8.111.1 - -nvcc versiontorch cuda version .................................... 11.211.1 - -deepspeed install pathnvcc version ................................ 11.2['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed install pathdeepspeed info .............................. 0.4.2+bc17042, bc17042, big-science['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] - -deepspeed wheel compiled w.deepspeed info ......................... torch 1.8, cuda 11.10.4.2+bc17042, bc17042, big-science - -deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1 -DeepSpeed general environment info:DeepSpeed general environment info: - -torch install path torch install path............... ............... ['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -['/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch'] -torch version torch version.................... ....................1.8.1 - 1.8.1 -torch cuda version ...............torch cuda version 11.1............... - nvcc version11.1 -.....................nvcc version 11.2..................... - deepspeed install path11.2 -...........deepspeed install path ...........['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -deepspeed info['/gpfsssd/worksf/projects/rech/six/commun/code/tr1-13B/DeepSpeed-big-science/deepspeed'] -................... deepspeed info 0.4.2+bc17042, bc17042, big-science................... 
---------------------------------------------------
-DeepSpeed C++/CUDA extension op report
---------------------------------------------------
-NOTE: Ops not installed will be just-in-time (JIT) compiled at
-      runtime if needed. Op compatibility means that your system
-      meets the required dependencies to JIT install the op.
---------------------------------------------------
-JIT compiled ops requires ninja
---------------------------------------------------
-ninja .................. [OKAY]
---------------------------------------------------
-op name ................ installed .. compatible
---------------------------------------------------
-cpu_adam ............... [YES] ...... [OKAY]
-fused_adam ............. [NO] ....... [OKAY]
-fused_lamb ............. [NO] ....... [OKAY]
-sparse_attn ............ [NO] ....... [OKAY]
-transformer ............ [NO] ....... [OKAY]
-stochastic_transformer . [NO] ....... [OKAY]
---------------------------------------------------
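-Ops flagged [NO] above are built lazily on first use. A hedged sketch of
-forcing one such JIT build up front (assuming the op_builder API of this era;
-in some DeepSpeed versions the package is the top-level `op_builder` module
-rather than `deepspeed.ops.op_builder`):
-
-    # Hedged sketch: trigger the ninja JIT build for fused Adam once,
-    # instead of paying the compile cost at the first optimizer step.
-    from deepspeed.ops.op_builder import FusedAdamBuilder
-    fused_adam_module = FusedAdamBuilder().load()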
-> initializing torch distributed ...
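-The line above corresponds to the usual NCCL process-group setup. A minimal
-sketch of what a Megatron-style launcher runs on every rank (RANK, WORLD_SIZE,
-LOCAL_RANK and the MASTER_* variables are assumed to come from the launcher):
-
-    import os
-    import torch
-
-    # Bind this process to its GPU, then join the NCCL process group;
-    # init_method="env://" reads MASTER_ADDR / MASTER_PORT.
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-    torch.distributed.init_process_group(
-        backend="nccl",
-        init_method="env://",
-        rank=int(os.environ["RANK"]),
-        world_size=int(os.environ["WORLD_SIZE"]),
-    )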
-> setting tensorboard ...
-> initializing tensor model parallel with size 4
-> initializing pipeline model parallel with size 8
-> setting random seeds to 42 ...
-[2021-09-24 02:11:21,021] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42
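-The seeds above are internally consistent: in the DeepSpeed checkpointing code
-I know, the tensor-model-parallel seed is the base seed plus a fixed offset of
-2718 plus the model-parallel rank, while the data-parallel seed stays at the
-base (the 2718 offset is an assumption from checkpointing.py, not this log):
-
-    # Hedged sketch of model_parallel_cuda_manual_seed's seed derivation.
-    base_seed = 42   # from "> setting random seeds to 42 ..."
-    tp_rank = 0      # model parallel rank 0 in the line above
-    model_parallel_seed = base_seed + 2718 + tp_rank  # = 2760, as logged
-    data_parallel_seed = base_seed                    # = 42, as logged
-    assert (model_parallel_seed, data_parallel_seed) == (2760, 42)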
-> compiling dataset index builder ...
-make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8-104B/Megatron-DeepSpeed-tr8-104B/megatron/data'
-make: Nothing to be done for 'default'.
-make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr8-104B/Megatron-DeepSpeed-tr8-104B/megatron/data'
->>> done with dataset index builder. Compilation time: 0.307 seconds
-> compiling and loading fused kernels ...
-/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning:
-
-                               !! WARNING !!
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Your compiler (c++) is not compatible with the compiler Pytorch was
-built with for this platform, which is g++ on linux. Please
-use g++ to compile your extension. Alternatively, you may
-compile PyTorch from source using c++, and then you can also use
-c++ to compile your extension.
-
-See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
-with compiling PyTorch from source.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-                               !! WARNING !!
-
-  warnings.warn(WRONG_COMPILER_WARNING.format(
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8-104B/Megatron-DeepSpeed-tr8-104B/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_upper_triang_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_upper_triang_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8-104B/Megatron-DeepSpeed-tr8-104B/megatron/fused_kernels/build/build.ninja...
-Building extension module scaled_masked_softmax_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module scaled_masked_softmax_cuda...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr8-104B/Megatron-DeepSpeed-tr8-104B/megatron/fused_kernels/build/build.ninja...
-Building extension module fused_mix_prec_layer_norm_cuda...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_mix_prec_layer_norm_cuda...
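-The UserWarning above fires because torch.utils.cpp_extension resolves the
-compiler as `c++` rather than `g++`; it is harmless here, since ninja finds
-the cached kernels ("no work to do"). A hedged sketch of silencing it, assuming
-cpp_extension honors the CXX environment variable for JIT builds (true in the
-torch versions I have checked):
-
-    # Hedged sketch: point the JIT builds at g++ explicitly, before any
-    # extension is compiled; equivalent to `export CXX=g++` in the job script.
-    import os
-    os.environ["CXX"] = "g++"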
Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! - - warnings.warn(WRONG_COMPILER_WARNING.format( -/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: - - !! WARNING !! - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Your compiler (c++) is not compatible with the compiler Pytorch was -built with for this platform, which is g++ on linux. Please -use g++ to to compile your extension. Alternatively, you may -compile PyTorch from source using c++, and then you can also use -c++ to compile your extension. - -See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help -with compiling PyTorch from source. -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - !! WARNING !! 
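The warning above (emitted once per rank; the identical repetitions are elided here) comes from torch.utils.cpp_extension's ABI check: the fused-kernel build resolves its host compiler from the CXX environment variable and falls back to plain c++, while this PyTorch binary was built with g++. A minimal sketch of the usual workaround, assuming that CXX lookup, which must run before the fused kernels are compiled:

    import os

    # torch.utils.cpp_extension falls back to "c++" when $CXX is unset; pointing
    # it at g++ matches the compiler PyTorch was built with and avoids the
    # ABI-compatibility warning. Set this before megatron builds its fused kernels.
    os.environ.setdefault("CXX", "g++")

In practice the warning is benign on systems where c++ is a symlink to g++, which is presumably why the compilation below still succeeds.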
->>> done with compiling and loading fused kernels. Compilation time: 27.029 seconds
-time to initialize megatron (seconds): 68.551
-[after megatron is initialized] datetime: 2021-09-24 02:11:48
-building GPT model ...
-[2021-09-24 02:11:48,760] [INFO] [utils.py:680:see_memory_usage] Before Building Model
-/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch/cuda/memory.py:373: FutureWarning: torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved
- warnings.warn(
-/gpfswork/rech/six/commun/conda/tr1-13B/lib/python3.8/site-packages/torch/cuda/memory.py:381: FutureWarning: torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved
- warnings.warn(
-[2021-09-24 02:11:48,762] [INFO] [utils.py:681:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB
-[2021-09-24 02:11:48,763] [INFO] [utils.py:689:see_memory_usage] CPU Virtual Memory: used = 37.77 GB, percent = 20.2%
-SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
-Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=0, data=1, model=0): 4, ProcessCoord(pipe=0, data=1, model=1): 5, ProcessCoord(pipe=0, data=1, model=2): 6, ProcessCoord(pipe=0, data=1, model=3): 7, ProcessCoord(pipe=0, data=2, model=0): 8, ProcessCoord(pipe=0, data=2, model=1): 9, ProcessCoord(pipe=0, data=2, model=2): 10, ProcessCoord(pipe=0, data=2, model=3): 11, ProcessCoord(pipe=0, data=3, model=0): 12, ProcessCoord(pipe=0, data=3, model=1): 13, ProcessCoord(pipe=0, data=3, model=2): 14, ProcessCoord(pipe=0, data=3, model=3): 15, ProcessCoord(pipe=0, data=4, model=0): 16, ProcessCoord(pipe=0, data=4, model=1): 17, ProcessCoord(pipe=0, data=4, model=2): 18, ProcessCoord(pipe=0, data=4, model=3): 19, ProcessCoord(pipe=0, data=5, model=0): 20, ProcessCoord(pipe=0, data=5, model=1): 21, ProcessCoord(pipe=0, data=5, model=2): 22, ProcessCoord(pipe=0, data=5, model=3): 23, ProcessCoord(pipe=0, data=6, model=0): 24, ProcessCoord(pipe=0, data=6, model=1): 25, ProcessCoord(pipe=0, data=6, model=2): 26, ProcessCoord(pipe=0, data=6, model=3): 27, ProcessCoord(pipe=0, data=7, model=0): 28, ProcessCoord(pipe=0, data=7, model=1): 29, ProcessCoord(pipe=0, data=7, model=2): 30, ProcessCoord(pipe=0, data=7, model=3): 31, ProcessCoord(pipe=1, data=0, model=0): 32, ProcessCoord(pipe=1, data=0, model=1): 33, ProcessCoord(pipe=1, data=0, model=2): 34, ProcessCoord(pipe=1, data=0, model=3): 35, ProcessCoord(pipe=1, data=1, model=0): 36, ProcessCoord(pipe=1, data=1, model=1): 37, ProcessCoord(pipe=1, data=1, model=2): 38, ProcessCoord(pipe=1, data=1, model=3): 39,
ProcessCoord(pipe=1, data=2, model=0): 40, ProcessCoord(pipe=1, data=2, model=1): 41, ProcessCoord(pipe=1, data=2, model=2): 42, ProcessCoord(pipe=1, data=2, model=3): 43, ProcessCoord(pipe=1, data=3, model=0): 44, ProcessCoord(pipe=1, data=3, model=1): 45, ProcessCoord(pipe=1, data=3, model=2): 46, ProcessCoord(pipe=1, data=3, model=3): 47, ProcessCoord(pipe=1, data=4, model=0): 48, ProcessCoord(pipe=1, data=4, model=1): 49, ProcessCoord(pipe=1, data=4, model=2): 50, ProcessCoord(pipe=1, data=4, model=3): 51, ProcessCoord(pipe=1, data=5, model=0): 52, ProcessCoord(pipe=1, data=5, model=1): 53, ProcessCoord(pipe=1, data=5, model=2): 54, ProcessCoord(pipe=1, data=5, model=3): 55, ProcessCoord(pipe=1, data=6, model=0): 56, ProcessCoord(pipe=1, data=6, model=1): 57, ProcessCoord(pipe=1, data=6, model=2): 58, ProcessCoord(pipe=1, data=6, model=3): 59, ProcessCoord(pipe=1, data=7, model=0): 60, ProcessCoord(pipe=1, data=7, model=1): 61, ProcessCoord(pipe=1, data=7, model=2): 62, ProcessCoord(pipe=1, data=7, model=3): 63, ProcessCoord(pipe=2, data=0, model=0): 64, ProcessCoord(pipe=2, data=0, model=1): 65, ProcessCoord(pipe=2, data=0, model=2): 66, ProcessCoord(pipe=2, data=0, model=3): 67, ProcessCoord(pipe=2, data=1, model=0): 68, ProcessCoord(pipe=2, data=1, model=1): 69, ProcessCoord(pipe=2, data=1, model=2): 70, ProcessCoord(pipe=2, data=1, model=3): 71, ProcessCoord(pipe=2, data=2, model=0): 72, ProcessCoord(pipe=2, data=2, model=1): 73, ProcessCoord(pipe=2, data=2, model=2): 74, ProcessCoord(pipe=2, data=2, model=3): 75, ProcessCoord(pipe=2, data=3, model=0): 76, ProcessCoord(pipe=2, data=3, model=1): 77, ProcessCoord(pipe=2, data=3, model=2): 78, ProcessCoord(pipe=2, data=3, model=3): 79, ProcessCoord(pipe=2, data=4, model=0): 80, ProcessCoord(pipe=2, data=4, model=1): 81, ProcessCoord(pipe=2, data=4, model=2): 82, ProcessCoord(pipe=2, data=4, model=3): 83, ProcessCoord(pipe=2, data=5, model=0): 84, ProcessCoord(pipe=2, data=5, model=1): 85, ProcessCoord(pipe=2, data=5, model=2): 86, ProcessCoord(pipe=2, data=5, model=3): 87, ProcessCoord(pipe=2, data=6, model=0): 88, ProcessCoord(pipe=2, data=6, model=1): 89, ProcessCoord(pipe=2, data=6, model=2): 90, ProcessCoord(pipe=2, data=6, model=3): 91, ProcessCoord(pipe=2, data=7, model=0): 92, ProcessCoord(pipe=2, data=7, model=1): 93, ProcessCoord(pipe=2, data=7, model=2): 94, ProcessCoord(pipe=2, data=7, model=3): 95, ProcessCoord(pipe=3, data=0, model=0): 96, ProcessCoord(pipe=3, data=0, model=1): 97, ProcessCoord(pipe=3, data=0, model=2): 98, ProcessCoord(pipe=3, data=0, model=3): 99, ProcessCoord(pipe=3, data=1, model=0): 100, ProcessCoord(pipe=3, data=1, model=1): 101, ProcessCoord(pipe=3, data=1, model=2): 102, ProcessCoord(pipe=3, data=1, model=3): 103, ProcessCoord(pipe=3, data=2, model=0): 104, ProcessCoord(pipe=3, data=2, model=1): 105, ProcessCoord(pipe=3, data=2, model=2): 106, ProcessCoord(pipe=3, data=2, model=3): 107, ProcessCoord(pipe=3, data=3, model=0): 108, ProcessCoord(pipe=3, data=3, model=1): 109, ProcessCoord(pipe=3, data=3, model=2): 110, ProcessCoord(pipe=3, data=3, model=3): 111, ProcessCoord(pipe=3, data=4, model=0): 112, ProcessCoord(pipe=3, data=4, model=1): 113, ProcessCoord(pipe=3, data=4, model=2): 114, ProcessCoord(pipe=3, data=4, model=3): 115, ProcessCoord(pipe=3, data=5, model=0): 116, ProcessCoord(pipe=3, data=5, model=1): 117, ProcessCoord(pipe=3, data=5, model=2): 118, ProcessCoord(pipe=3, data=5, model=3): 119, ProcessCoord(pipe=3, data=6, model=0): 120, ProcessCoord(pipe=3, data=6, model=1): 121, 
ProcessCoord(pipe=3, data=6, model=2): 122, ProcessCoord(pipe=3, data=6, model=3): 123, ProcessCoord(pipe=3, data=7, model=0): 124, ProcessCoord(pipe=3, data=7, model=1): 125, ProcessCoord(pipe=3, data=7, model=2): 126, ProcessCoord(pipe=3, data=7, model=3): 127, ProcessCoord(pipe=4, data=0, model=0): 128, ProcessCoord(pipe=4, data=0, model=1): 129, ProcessCoord(pipe=4, data=0, model=2): 130, ProcessCoord(pipe=4, data=0, model=3): 131, ProcessCoord(pipe=4, data=1, model=0): 132, ProcessCoord(pipe=4, data=1, model=1): 133, ProcessCoord(pipe=4, data=1, model=2): 134, ProcessCoord(pipe=4, data=1, model=3): 135, ProcessCoord(pipe=4, data=2, model=0): 136, ProcessCoord(pipe=4, data=2, model=1): 137, ProcessCoord(pipe=4, data=2, model=2): 138, ProcessCoord(pipe=4, data=2, model=3): 139, ProcessCoord(pipe=4, data=3, model=0): 140, ProcessCoord(pipe=4, data=3, model=1): 141, ProcessCoord(pipe=4, data=3, model=2): 142, ProcessCoord(pipe=4, data=3, model=3): 143, ProcessCoord(pipe=4, data=4, model=0): 144, ProcessCoord(pipe=4, data=4, model=1): 145, ProcessCoord(pipe=4, data=4, model=2): 146, ProcessCoord(pipe=4, data=4, model=3): 147, ProcessCoord(pipe=4, data=5, model=0): 148, ProcessCoord(pipe=4, data=5, model=1): 149, ProcessCoord(pipe=4, data=5, model=2): 150, ProcessCoord(pipe=4, data=5, model=3): 151, ProcessCoord(pipe=4, data=6, model=0): 152, ProcessCoord(pipe=4, data=6, model=1): 153, ProcessCoord(pipe=4, data=6, model=2): 154, ProcessCoord(pipe=4, data=6, model=3): 155, ProcessCoord(pipe=4, data=7, model=0): 156, ProcessCoord(pipe=4, data=7, model=1): 157, ProcessCoord(pipe=4, data=7, model=2): 158, ProcessCoord(pipe=4, data=7, model=3): 159, ProcessCoord(pipe=5, data=0, model=0): 160, ProcessCoord(pipe=5, data=0, model=1): 161, ProcessCoord(pipe=5, data=0, model=2): 162, ProcessCoord(pipe=5, data=0, model=3): 163, ProcessCoord(pipe=5, data=1, model=0): 164, ProcessCoord(pipe=5, data=1, model=1): 165, ProcessCoord(pipe=5, data=1, model=2): 166, ProcessCoord(pipe=5, data=1, model=3): 167, ProcessCoord(pipe=5, data=2, model=0): 168, ProcessCoord(pipe=5, data=2, model=1): 169, ProcessCoord(pipe=5, data=2, model=2): 170, ProcessCoord(pipe=5, data=2, model=3): 171, ProcessCoord(pipe=5, data=3, model=0): 172, ProcessCoord(pipe=5, data=3, model=1): 173, ProcessCoord(pipe=5, data=3, model=2): 174, ProcessCoord(pipe=5, data=3, model=3): 175, ProcessCoord(pipe=5, data=4, model=0): 176, ProcessCoord(pipe=5, data=4, model=1): 177, ProcessCoord(pipe=5, data=4, model=2): 178, ProcessCoord(pipe=5, data=4, model=3): 179, ProcessCoord(pipe=5, data=5, model=0): 180, ProcessCoord(pipe=5, data=5, model=1): 181, ProcessCoord(pipe=5, data=5, model=2): 182, ProcessCoord(pipe=5, data=5, model=3): 183, ProcessCoord(pipe=5, data=6, model=0): 184, ProcessCoord(pipe=5, data=6, model=1): 185, ProcessCoord(pipe=5, data=6, model=2): 186, ProcessCoord(pipe=5, data=6, model=3): 187, ProcessCoord(pipe=5, data=7, model=0): 188, ProcessCoord(pipe=5, data=7, model=1): 189, ProcessCoord(pipe=5, data=7, model=2): 190, ProcessCoord(pipe=5, data=7, model=3): 191, ProcessCoord(pipe=6, data=0, model=0): 192, ProcessCoord(pipe=6, data=0, model=1): 193, ProcessCoord(pipe=6, data=0, model=2): 194, ProcessCoord(pipe=6, data=0, model=3): 195, ProcessCoord(pipe=6, data=1, model=0): 196, ProcessCoord(pipe=6, data=1, model=1): 197, ProcessCoord(pipe=6, data=1, model=2): 198, ProcessCoord(pipe=6, data=1, model=3): 199, ProcessCoord(pipe=6, data=2, model=0): 200, ProcessCoord(pipe=6, data=2, model=1): 201, ProcessCoord(pipe=6, data=2, 
model=2): 202, ProcessCoord(pipe=6, data=2, model=3): 203, ProcessCoord(pipe=6, data=3, model=0): 204, ProcessCoord(pipe=6, data=3, model=1): 205, ProcessCoord(pipe=6, data=3, model=2): 206, ProcessCoord(pipe=6, data=3, model=3): 207, ProcessCoord(pipe=6, data=4, model=0): 208, ProcessCoord(pipe=6, data=4, model=1): 209, ProcessCoord(pipe=6, data=4, model=2): 210, ProcessCoord(pipe=6, data=4, model=3): 211, ProcessCoord(pipe=6, data=5, model=0): 212, ProcessCoord(pipe=6, data=5, model=1): 213, ProcessCoord(pipe=6, data=5, model=2): 214, ProcessCoord(pipe=6, data=5, model=3): 215, ProcessCoord(pipe=6, data=6, model=0): 216, ProcessCoord(pipe=6, data=6, model=1): 217, ProcessCoord(pipe=6, data=6, model=2): 218, ProcessCoord(pipe=6, data=6, model=3): 219, ProcessCoord(pipe=6, data=7, model=0): 220, ProcessCoord(pipe=6, data=7, model=1): 221, ProcessCoord(pipe=6, data=7, model=2): 222, ProcessCoord(pipe=6, data=7, model=3): 223, ProcessCoord(pipe=7, data=0, model=0): 224, ProcessCoord(pipe=7, data=0, model=1): 225, ProcessCoord(pipe=7, data=0, model=2): 226, ProcessCoord(pipe=7, data=0, model=3): 227, ProcessCoord(pipe=7, data=1, model=0): 228, ProcessCoord(pipe=7, data=1, model=1): 229, ProcessCoord(pipe=7, data=1, model=2): 230, ProcessCoord(pipe=7, data=1, model=3): 231, ProcessCoord(pipe=7, data=2, model=0): 232, ProcessCoord(pipe=7, data=2, model=1): 233, ProcessCoord(pipe=7, data=2, model=2): 234, ProcessCoord(pipe=7, data=2, model=3): 235, ProcessCoord(pipe=7, data=3, model=0): 236, ProcessCoord(pipe=7, data=3, model=1): 237, ProcessCoord(pipe=7, data=3, model=2): 238, ProcessCoord(pipe=7, data=3, model=3): 239, ProcessCoord(pipe=7, data=4, model=0): 240, ProcessCoord(pipe=7, data=4, model=1): 241, ProcessCoord(pipe=7, data=4, model=2): 242, ProcessCoord(pipe=7, data=4, model=3): 243, ProcessCoord(pipe=7, data=5, model=0): 244, ProcessCoord(pipe=7, data=5, model=1): 245, ProcessCoord(pipe=7, data=5, model=2): 246, ProcessCoord(pipe=7, data=5, model=3): 247, ProcessCoord(pipe=7, data=6, model=0): 248, ProcessCoord(pipe=7, data=6, model=1): 249, ProcessCoord(pipe=7, data=6, model=2): 250, ProcessCoord(pipe=7, data=6, model=3): 251, ProcessCoord(pipe=7, data=7, model=0): 252, ProcessCoord(pipe=7, data=7, model=1): 253, ProcessCoord(pipe=7, data=7, model=2): 254, ProcessCoord(pipe=7, data=7, model=3): 255} -[2021-09-24 02:11:50,155] [INFO] [module.py:360:_partition_layers] Partitioning pipeline stages with method type:transformer -stage=0 layers=7 - 0: _to_float16 - 1: EmbeddingPipe - 2: - 3: ParallelTransformerLayerPipe - 4: ParallelTransformerLayerPipe - 5: ParallelTransformerLayerPipe - 6: ParallelTransformerLayerPipe -stage=1 layers=4 - 7: ParallelTransformerLayerPipe - 8: ParallelTransformerLayerPipe - 9: ParallelTransformerLayerPipe - 10: ParallelTransformerLayerPipe -stage=2 layers=4 - 11: ParallelTransformerLayerPipe - 12: ParallelTransformerLayerPipe - 13: ParallelTransformerLayerPipe - 14: ParallelTransformerLayerPipe -stage=3 layers=4 - 15: ParallelTransformerLayerPipe - 16: ParallelTransformerLayerPipe - 17: ParallelTransformerLayerPipe - 18: ParallelTransformerLayerPipe -stage=4 layers=4 - 19: ParallelTransformerLayerPipe - 20: ParallelTransformerLayerPipe - 21: ParallelTransformerLayerPipe - 22: ParallelTransformerLayerPipe -stage=5 layers=4 - 23: ParallelTransformerLayerPipe - 24: ParallelTransformerLayerPipe - 25: ParallelTransformerLayerPipe - 26: ParallelTransformerLayerPipe -stage=6 layers=4 - 27: ParallelTransformerLayerPipe - 28: ParallelTransformerLayerPipe - 29: 
ParallelTransformerLayerPipe - 30: ParallelTransformerLayerPipe -stage=7 layers=8 - 31: ParallelTransformerLayerPipe - 32: ParallelTransformerLayerPipe - 33: ParallelTransformerLayerPipe - 34: ParallelTransformerLayerPipe - 35: - 36: MixedFusedLayerNorm - 37: EmbeddingPipe - 38: float16_to_fp32 - loss: CrossEntropy - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (3, 1): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (2, 1): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (3, 3): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (2, 3): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (3, 6): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (0, 6): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (1, 6): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (2, 6): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (2, 2): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (3, 2): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (0, 5): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (1, 5): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (3, 5): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (2, 5): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (0, 4): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (2, 4): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (1, 4): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (3, 4): 1745293312 - > number of parameters on (tensor, pipeline) model parallel rank (2, 7): 1986498560 - > number of parameters on (tensor, pipeline) model parallel rank (1, 7): 1986498560 - > number of parameters on (tensor, pipeline) model parallel rank (0, 7): 1986498560 - > number of parameters on (tensor, pipeline) model parallel rank (3, 7): 1986498560 - > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1986465792 - > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1986465792 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 1986465792 -[2021-09-24 02:11:51,439] [INFO] [utils.py:680:see_memory_usage] After Building Model -[2021-09-24 02:11:51,440] [INFO] [utils.py:681:see_memory_usage] MA 3.77 GB Max_MA 3.79 GB CA 3.79 GB Max_CA 4 GB -[2021-09-24 02:11:51,441] [INFO] [utils.py:689:see_memory_usage] CPU Virtual Memory: used = 37.96 GB, percent = 20.3% - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1986465792 -setting training iterations to 159576 -> learning rate decay style: cosine -DeepSpeed is enabled. 
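The "Using topology" dump above enumerates the 256 ranks of the 8 (pipe) x 8 (data) x 4 (model) grid with the model axis varying fastest, then data, then pipe. A minimal sketch reproducing that mapping (coord_to_rank is illustrative, not a DeepSpeed API):

    # Reproduce the rank <-> ProcessCoord mapping from the topology dump.
    PP, DP, TP = 8, 8, 4  # pipe, data, model (tensor) parallel degrees

    def coord_to_rank(pipe: int, data: int, model: int) -> int:
        # model varies fastest, then data, then pipe
        return model + TP * (data + DP * pipe)

    assert coord_to_rank(1, 2, 0) == 40    # ProcessCoord(pipe=1, data=2, model=0): 40
    assert coord_to_rank(7, 7, 3) == 255   # last entry in the dump

This ordering is what makes each pipeline stage in the partitioning above own a contiguous block of ranks, with each 4-way tensor-parallel group occupying consecutive rank numbers.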
-[2021-09-24 02:11:51,495] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.4.2+bc17042, git-hash=bc17042, git-branch=big-science -[2021-09-24 02:11:51,606] [INFO] [engine.py:179:__init__] DeepSpeed Flops Profiler Enabled: False -[2021-09-24 02:11:51,606] [INFO] [engine.py:736:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer -[2021-09-24 02:11:51,606] [INFO] [engine.py:741:_configure_optimizer] Using client Optimizer as basic optimizer -[2021-09-24 02:11:51,606] [INFO] [engine.py:750:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -[2021-09-24 02:11:51,607] [INFO] [utils.py:43:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= -[2021-09-24 02:11:51,607] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer -[2021-09-24 02:11:51,607] [INFO] [stage2.py:106:__init__] Reduce bucket size 500000000 -[2021-09-24 02:11:51,607] [INFO] [stage2.py:107:__init__] Allgather bucket size 500000000 -[2021-09-24 02:11:51,607] [INFO] [stage2.py:108:__init__] CPU Offload: False -[2021-09-24 02:11:51,607] [INFO] [stage2.py:109:__init__] Round robin gradient partitioning: False -[2021-09-24 02:11:56,299] [INFO] [stage2.py:419:__init__] optimizer state initialized -[2021-09-24 02:11:56,299] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam -[2021-09-24 02:11:56,299] [INFO] [engine.py:553:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2021-09-24 02:11:56,299] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = -[2021-09-24 02:11:56,300] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999)] -[2021-09-24 02:11:56,300] [INFO] [config.py:900:print] DeepSpeedEngine configuration: -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] allreduce_always_fp32 ........ False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] amp_enabled .................. False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] amp_params ................... False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] checkpoint_tag_validation_enabled True -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] checkpoint_tag_validation_fail False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] disable_allgather ............ False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] dump_state ................... False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_enabled ........... False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_gas_boundary_resolution 1 -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_layer_name ........ bert.encoder.layer -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_layer_num ......... 
0 -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_max_iter .......... 100 -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_stability ......... 1e-06 -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_tol ............... 0.01 -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] eigenvalue_verbose ........... False -[2021-09-24 02:11:56,300] [INFO] [config.py:904:print] elasticity_enabled ........... False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] fp16_enabled ................. True -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] fp16_mixed_quantize .......... False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] global_rank .................. 0 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] gradient_accumulation_steps .. 256 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] gradient_clipping ............ 1.0 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] gradient_predivide_factor .... 1.0 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] initial_dynamic_scale ........ 4096 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] loss_scale ................... 0 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] memory_breakdown ............. False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] optimizer_legacy_fusion ...... False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] optimizer_name ............... None -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] optimizer_params ............. None -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] pld_enabled .................. False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] pld_params ................... False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] prescale_gradients ........... False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_change_rate ......... 0.001 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_groups .............. 1 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_offset .............. 1000 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_period .............. 1000 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_rounding ............ 0 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_start_bits .......... 16 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_target_bits ......... 8 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_training_enabled .... False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_type ................ 0 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] quantize_verbose ............. False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] scheduler_name ............... None -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] scheduler_params ............. None -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] sparse_attention ............. None -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] sparse_gradients_enabled ..... 
False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] steps_per_print .............. 2000 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] tensorboard_enabled .......... False -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] tensorboard_job_name ......... DeepSpeedJobName -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] tensorboard_output_path ...... -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] train_batch_size ............. 2048 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] train_micro_batch_size_per_gpu 1 -[2021-09-24 02:11:56,301] [INFO] [config.py:904:print] use_quantizer_kernel ......... False -[2021-09-24 02:11:56,302] [INFO] [config.py:904:print] wall_clock_breakdown ......... False -[2021-09-24 02:11:56,302] [INFO] [config.py:904:print] world_size ................... 8 -[2021-09-24 02:11:56,302] [INFO] [config.py:904:print] zero_allow_untested_optimizer False -[2021-09-24 02:11:56,302] [INFO] [config.py:904:print] zero_config .................. { - "stage": 1, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+09, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "round_robin_gradients": false, - "legacy_stage1": false -} -[2021-09-24 02:11:56,302] [INFO] [config.py:904:print] zero_enabled ................. True -[2021-09-24 02:11:56,302] [INFO] [config.py:904:print] zero_optimization_stage ...... 
1 -[2021-09-24 02:11:56,302] [INFO] [config.py:906:print] json = { - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": 2.048000e+03, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 1 - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, - "steps_per_print": 2.000000e+03, - "wall_clock_breakdown": false -} -[2021-09-24 02:11:56,302] [INFO] [engine.py:76:__init__] CONFIG: micro_batches=256 micro_batch_size=1 -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=0 STAGE=0 LAYERS=7 [0, 7) STAGE_PARAMS=1986465792 (1986.466M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=3 STAGE=0 LAYERS=7 [0, 7) STAGE_PARAMS=1986465792 (1986.466M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=1 STAGE=0 LAYERS=7 [0, 7) STAGE_PARAMS=1986465792 (1986.466M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=2 STAGE=0 LAYERS=7 [0, 7) STAGE_PARAMS=1986465792 (1986.466M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=131 STAGE=4 LAYERS=4 [19, 23) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=128 STAGE=4 LAYERS=4 [19, 23) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=129 STAGE=4 LAYERS=4 [19, 23) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=194 STAGE=6 LAYERS=4 [27, 31) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=193 STAGE=6 LAYERS=4 [27, 31) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=192 STAGE=6 LAYERS=4 [27, 31) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=195 STAGE=6 LAYERS=4 [27, 31) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=64 STAGE=2 LAYERS=4 [11, 15) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=66 STAGE=2 LAYERS=4 [11, 15) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=65 STAGE=2 LAYERS=4 [11, 15) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=67 STAGE=2 LAYERS=4 [11, 15) STAGE_PARAMS=1745293312 (1745.293M) 
TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=32 STAGE=1 LAYERS=4 [7, 11) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=33 STAGE=1 LAYERS=4 [7, 11) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=35 STAGE=1 LAYERS=4 [7, 11) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=130 STAGE=4 LAYERS=4 [19, 23) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=97 STAGE=3 LAYERS=4 [15, 19) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=96 STAGE=3 LAYERS=4 [15, 19) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=99 STAGE=3 LAYERS=4 [15, 19) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=98 STAGE=3 LAYERS=4 [15, 19) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=224 STAGE=7 LAYERS=8 [31, 39) STAGE_PARAMS=1986498560 (1986.499M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=225 STAGE=7 LAYERS=8 [31, 39) STAGE_PARAMS=1986498560 (1986.499M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=226 STAGE=7 LAYERS=8 [31, 39) STAGE_PARAMS=1986498560 (1986.499M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=227 STAGE=7 LAYERS=8 [31, 39) STAGE_PARAMS=1986498560 (1986.499M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=160 STAGE=5 LAYERS=4 [23, 27) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=163 STAGE=5 LAYERS=4 [23, 27) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=161 STAGE=5 LAYERS=4 [23, 27) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=162 STAGE=5 LAYERS=4 [23, 27) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) UNIQUE_PARAMS=56814206976 (56814.207M) -[2021-09-24 02:11:56,606] [INFO] [engine.py:134:__init__] RANK=34 STAGE=1 LAYERS=4 [7, 11) STAGE_PARAMS=1745293312 (1745.293M) TOTAL_PARAMS=57778896896 (57778.897M) 
UNIQUE_PARAMS=56814206976 (56814.207M)
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-WARNING: could not find the metadata file /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints - will not load any checkpoints and will start from random
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,752] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. -[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
-[2021-09-24 02:11:56,753] [WARNING] [engine.py:1744:load_checkpoint] Unable to find latest file at /gpfsscratch/rech/six/commun/checkpoints/tr8-104B/checkpoints/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
-time (ms) | load-checkpoint: 1.91
-[after model, optimizer, and learning rate scheduler are built] datetime: 2021-09-24 02:11:56
-> building train, validation, and test datasets ...
- > datasets target sizes (minimum size):
- train: 300000000
- validation: 1638400
- test: 10240
-> building train, validation, and test datasets for GPT ...
- > building dataset index ...
- reading sizes...
- reading pointers...
- reading document index...
- creating numpy buffer of mmap...
- creating memory view of numpy buffer...
- > finished creating indexed dataset in 0.214922 seconds
- number of documents: 304230423
- > dataset split:
- train:
- document indices in [0, 288714672) total of 288714672 documents
- validation:
- document indices in [288714672, 303926193) total of 15211521 documents
- test:
- document indices in [303926193, 304230423) total of 304230 documents
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_300000000ns_2048sl_42s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_300000000ns_2048sl_42s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_300000000ns_2048sl_42s_shuffle_idx.npy
- loaded indexed file in 0.337 seconds
- total number of samples: 394611670
- total number of epochs: 3
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_1638400ns_2048sl_42s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_1638400ns_2048sl_42s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_1638400ns_2048sl_42s_shuffle_idx.npy
- loaded indexed file in 0.309 seconds
- total number of samples: 6927161
- total number of epochs: 1
- > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_42s_doc_idx.npy
- > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_42s_sample_idx.npy
- > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_42s_shuffle_idx.npy
- loaded indexed file in 0.060 seconds
- total number of samples: 137384
- total number of epochs: 1
-> finished creating GPT datasets ...
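The split boundaries above are exactly what Megatron-LM's fractional split arithmetic produces for this corpus: 288714672/304230423 ≈ 0.949, 15211521/304230423 ≈ 0.050 and 304230/304230423 ≈ 0.001, which points at a "949,50,1"-style --split argument (inferred from the numbers; the flag itself is not shown in this excerpt). A minimal sketch of that rounding, assuming the cumulative-round-then-shift behavior of Megatron's split helper:

    # Sketch: reproduce the train/valid/test document ranges above from
    # fractional splits. "949,50,1" is inferred from the boundaries in the
    # log; the drift adjustment mirrors Megatron-LM's cumulative rounding.
    def split_ranges(splits, size):
        fracs = [s / sum(splits) for s in splits]
        idx = [0]
        for f in fracs:
            idx.append(idx[-1] + int(round(f * size)))
        diff = idx[-1] - size                  # rounding drift (here: -1)
        idx = [idx[0]] + [i - diff for i in idx[1:]]
        return list(zip(idx[:-1], idx[1:]))

    print(split_ranges([949, 50, 1], 304230423))
    # [(0, 288714672), (288714672, 303926193), (303926193, 304230423)]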
-[after dataloaders are built] datetime: 2021-09-24 02:12:03
-done with setup ...
-training ...
-time (ms) | model-and-optimizer-setup: 8062.72 | train/valid/test-data-iterators-setup: 5729.09
-[before the start of training step] datetime: 2021-09-24 02:12:03
-[2021-09-24 02:12:03,365] [INFO] [checkpointing.py:408:forward] Activation Checkpointing Information
-[2021-09-24 02:12:03,365] [INFO] [checkpointing.py:409:forward] ----Partition Activations False, CPU CHECKPOINTING False
-[2021-09-24 02:12:03,365] [INFO] [checkpointing.py:412:forward] ----contiguous Memory Checkpointing False with 32 total layers
-[2021-09-24 02:12:03,365] [INFO] [checkpointing.py:415:forward] ----Synchronization False
-[2021-09-24 02:12:03,365] [INFO] [checkpointing.py:416:forward] ----Profiling time in checkpointing False
-[Rank 1] (after 1 iterations) memory (MB) | allocated: 6661.611328125 | max allocated: 11742.55810546875 | reserved: 21150.0 | max reserved: 21150.0
-[Rank 33] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 65] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 97] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 225] (after 1 iterations) memory (MB) | allocated: 7107.70751953125 | max allocated: 11884.6845703125 | reserved: 22492.0 | max reserved: 22492.0
-[Rank 129] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 193] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18586.0 | max reserved: 18586.0
-[Rank 161] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 2] (after 1 iterations) memory (MB) | allocated: 6661.611328125 | max allocated: 11742.55810546875 | reserved: 21150.0 | max reserved: 21150.0
-[Rank 34] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 226] (after 1 iterations) memory (MB) | allocated: 7107.70751953125 | max allocated: 11884.6845703125 | reserved: 21700.0 | max reserved: 21700.0
-[Rank 66] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18778.0 | max reserved: 18778.0
-[Rank 98] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18586.0 | max reserved: 18586.0
-[Rank 130] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 194] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18650.0 | max reserved: 18650.0
-[Rank 162] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 0] (after 1 iterations) memory (MB) | allocated: 6661.611328125 | max allocated: 11742.55810546875 | reserved: 21470.0 | max reserved: 21470.0
-[Rank 64] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 19252.0 | max reserved: 19252.0
-[Rank 32] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18868.0 | max reserved: 18868.0
-[Rank 128] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18868.0 | max reserved: 18868.0
-[Rank 96] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18868.0 | max reserved: 18868.0
-[Rank 224] (after 1 iterations) memory (MB) | allocated: 7107.70751953125 | max allocated: 11884.6845703125 | reserved: 22492.0 | max reserved: 22492.0
-[Rank 192] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18868.0 | max reserved: 18868.0
-[Rank 160] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18868.0 | max reserved: 18868.0
-[Rank 35] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 3] (after 1 iterations) memory (MB) | allocated: 6661.611328125 | max allocated: 11742.55810546875 | reserved: 21150.0 | max reserved: 21150.0
-[Rank 67] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18522.0 | max reserved: 18522.0
-[Rank 99] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
-[Rank 131] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18522.0 | max reserved: 18522.0
-[Rank 227] (after 1 iterations) memory (MB) | allocated: 7107.70751953125 | max allocated: 11884.6845703125 | reserved: 21700.0 | max reserved: 21700.0
-[Rank 195] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18586.0 | max reserved: 18586.0
-[Rank 163] (after 1 iterations) memory (MB) | allocated: 5861.5498046875 | max allocated: 10450.46337890625 | reserved: 18442.0 | max reserved: 18442.0
- iteration 1/ 159576 | consumed samples: 16 | elapsed time per iteration (ms): 31536.2 | learning rate: 4.438E-09 | global batch size: 16 | lm loss: 1.426722E+01 | loss scale: 4096.0 | grad norm: 1863985.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 2/ 159576 | consumed samples: 32 | elapsed time per iteration (ms): 13049.6 | learning rate: 8.876E-09 | global batch size: 16 | lm loss: 1.429125E+01 | loss scale: 4096.0 | grad norm: 1882741.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 3/ 159576 | consumed samples: 48 | elapsed time per iteration (ms): 13671.4 | learning rate: 1.331E-08 | global batch size: 16 | lm loss: 1.421026E+01 | loss scale: 4096.0 | grad norm: 1871916.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 4/ 159576 | consumed samples: 64 | elapsed time per iteration (ms): 13544.5 | learning rate: 1.775E-08 | global batch size: 16 | lm loss: 1.424627E+01 | loss scale: 4096.0 | grad norm: 1912485.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 5/ 159576 | consumed samples: 80 | elapsed time per iteration (ms): 13955.0 | learning rate: 2.219E-08 | global batch size: 16 | lm loss: 1.421161E+01 | loss scale: 4096.0 | grad norm: 1873991.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
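Note the shape of the numbers above: ranks 0-3 and 224-227 (the first and last pipeline stages) allocate noticeably more than the middle stages, consistent with those stages also holding the embedding/output layers. The per-rank lines themselves are just the standard torch.cuda allocator counters converted to MB; a minimal re-creation of such a report (the function name is illustrative, not the exact Megatron helper):

    import torch

    def report_memory(rank, iteration):
        # All four counters are standard torch.cuda statistics, in bytes.
        mb = 1024 * 1024
        print(f"[Rank {rank}] (after {iteration} iterations) memory (MB) | "
              f"allocated: {torch.cuda.memory_allocated() / mb} | "
              f"max allocated: {torch.cuda.max_memory_allocated() / mb} | "
              f"reserved: {torch.cuda.memory_reserved() / mb} | "
              f"max reserved: {torch.cuda.max_memory_reserved() / mb}")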
- iteration 6/ 159576 | consumed samples: 96 | elapsed time per iteration (ms): 13725.9 | learning rate: 2.663E-08 | global batch size: 16 | lm loss: 1.423833E+01 | loss scale: 4096.0 | grad norm: 1889068.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 7/ 159576 | consumed samples: 112 | elapsed time per iteration (ms): 13496.8 | learning rate: 3.107E-08 | global batch size: 16 | lm loss: 1.423929E+01 | loss scale: 4096.0 | grad norm: 1864001.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 8/ 159576 | consumed samples: 128 | elapsed time per iteration (ms): 13565.8 | learning rate: 3.550E-08 | global batch size: 16 | lm loss: 1.424760E+01 | loss scale: 4096.0 | grad norm: 1867381.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 9/ 159576 | consumed samples: 144 | elapsed time per iteration (ms): 14076.3 | learning rate: 3.994E-08 | global batch size: 16 | lm loss: 1.418199E+01 | loss scale: 4096.0 | grad norm: 1902029.931 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 10/ 159576 | consumed samples: 160 | elapsed time per iteration (ms): 13497.5 | learning rate: 4.438E-08 | global batch size: 16 | lm loss: 1.412427E+01 | loss scale: 4096.0 | grad norm: 1865649.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 11/ 159576 | consumed samples: 176 | elapsed time per iteration (ms): 13459.5 | learning rate: 4.882E-08 | global batch size: 16 | lm loss: 1.407386E+01 | loss scale: 4096.0 | grad norm: 1861067.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 12/ 159576 | consumed samples: 192 | elapsed time per iteration (ms): 13581.0 | learning rate: 5.325E-08 | global batch size: 16 | lm loss: 1.400436E+01 | loss scale: 4096.0 | grad norm: 1857208.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 13/ 159576 | consumed samples: 208 | elapsed time per iteration (ms): 13877.0 | learning rate: 5.769E-08 | global batch size: 16 | lm loss: 1.374212E+01 | loss scale: 4096.0 | grad norm: 1860712.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 14/ 159576 | consumed samples: 224 | elapsed time per iteration (ms): 13730.6 | learning rate: 6.213E-08 | global batch size: 16 | lm loss: 1.363158E+01 | loss scale: 4096.0 | grad norm: 1835837.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 15/ 159576 | consumed samples: 240 | elapsed time per iteration (ms): 13589.9 | learning rate: 6.657E-08 | global batch size: 16 | lm loss: 1.353429E+01 | loss scale: 4096.0 | grad norm: 1866742.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 16/ 159576 | consumed samples: 256 | elapsed time per iteration (ms): 13709.9 | learning rate: 7.101E-08 | global batch size: 16 | lm loss: 1.346230E+01 | loss scale: 4096.0 | grad norm: 1867848.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 17/ 159576 | consumed samples: 272 | elapsed time per iteration (ms): 13515.8 | learning rate: 7.544E-08 | global batch size: 16 | lm loss: 1.257517E+01 | loss scale: 4096.0 | grad norm: 1827444.965 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 18/ 159576 | consumed samples: 288 | elapsed time per iteration (ms): 13800.0 | learning rate: 7.988E-08 | global batch size: 16 | lm loss: 1.251998E+01 | loss scale: 4096.0 | grad norm: 2020558.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 19/ 159576 | consumed samples: 304 | elapsed time per iteration (ms): 13516.3 | learning rate: 8.432E-08 | global batch size: 16 | lm loss: 1.265157E+01 | loss scale: 4096.0 | grad norm: 2257407.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 20/ 159576 | consumed samples: 320 | elapsed time per iteration (ms): 13549.6 | learning rate: 8.876E-08 | global batch size: 16 | lm loss: 1.252521E+01 | loss scale: 4096.0 | grad norm: 2095375.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 21/ 159576 | consumed samples: 336 | elapsed time per iteration (ms): 13586.7 | learning rate: 9.320E-08 | global batch size: 16 | lm loss: 1.244903E+01 | loss scale: 4096.0 | grad norm: 2211855.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 22/ 159576 | consumed samples: 352 | elapsed time per iteration (ms): 14140.0 | learning rate: 9.763E-08 | global batch size: 16 | lm loss: 1.221426E+01 | loss scale: 4096.0 | grad norm: 2152853.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 23/ 159576 | consumed samples: 368 | elapsed time per iteration (ms): 13565.7 | learning rate: 1.021E-07 | global batch size: 16 | lm loss: 1.223387E+01 | loss scale: 4096.0 | grad norm: 2257726.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 24/ 159576 | consumed samples: 384 | elapsed time per iteration (ms): 13529.2 | learning rate: 1.065E-07 | global batch size: 16 | lm loss: 1.252795E+01 | loss scale: 4096.0 | grad norm: 2648402.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 25/ 159576 | consumed samples: 400 | elapsed time per iteration (ms): 13468.4 | learning rate: 1.109E-07 | global batch size: 16 | lm loss: 1.249682E+01 | loss scale: 4096.0 | grad norm: 2816711.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 26/ 159576 | consumed samples: 416 | elapsed time per iteration (ms): 13529.9 | learning rate: 1.154E-07 | global batch size: 16 | lm loss: 1.219784E+01 | loss scale: 4096.0 | grad norm: 2380750.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 27/ 159576 | consumed samples: 432 | elapsed time per iteration (ms): 13833.4 | learning rate: 1.198E-07 | global batch size: 16 | lm loss: 1.182601E+01 | loss scale: 4096.0 | grad norm: 2116005.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 28/ 159576 | consumed samples: 448 | elapsed time per iteration (ms): 13615.6 | learning rate: 1.243E-07 | global batch size: 16 | lm loss: 1.159655E+01 | loss scale: 4096.0 | grad norm: 1805209.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 29/ 159576 | consumed samples: 464 | elapsed time per iteration (ms): 13371.2 | learning rate: 1.287E-07 | global batch size: 16 | lm loss: 1.165552E+01 | loss scale: 4096.0 | grad norm: 1731569.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
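The learning-rate column grows by a fixed 4.438E-09 per step (iteration 1: 4.438E-09, iteration 10: 4.438E-08, iteration 23: 1.021E-07), i.e. a plain linear warmup; the peak LR and warmup length are not visible in this excerpt. A two-line check of the increments:

    # Sketch: confirm the LR column is linear in the iteration number.
    step = 4.438e-9                    # per-iteration increment read off the log
    for it in (1, 10, 23, 100):
        print(it, f"{it * step:.3E}")  # 4.438E-09, 4.438E-08, 1.021E-07, 4.438E-07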
- iteration 30/ 159576 | consumed samples: 480 | elapsed time per iteration (ms): 13604.8 | learning rate: 1.331E-07 | global batch size: 16 | lm loss: 1.154380E+01 | loss scale: 4096.0 | grad norm: 1706578.844 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 31/ 159576 | consumed samples: 496 | elapsed time per iteration (ms): 13982.3 | learning rate: 1.376E-07 | global batch size: 16 | lm loss: 1.139362E+01 | loss scale: 4096.0 | grad norm: 1757980.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 32/ 159576 | consumed samples: 512 | elapsed time per iteration (ms): 13306.0 | learning rate: 1.420E-07 | global batch size: 16 | lm loss: 1.148209E+01 | loss scale: 4096.0 | grad norm: 1697993.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 33/ 159576 | consumed samples: 528 | elapsed time per iteration (ms): 13575.8 | learning rate: 1.464E-07 | global batch size: 16 | lm loss: 1.140995E+01 | loss scale: 4096.0 | grad norm: 1670562.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 34/ 159576 | consumed samples: 544 | elapsed time per iteration (ms): 13613.2 | learning rate: 1.509E-07 | global batch size: 16 | lm loss: 1.132776E+01 | loss scale: 4096.0 | grad norm: 1643305.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 35/ 159576 | consumed samples: 560 | elapsed time per iteration (ms): 13869.9 | learning rate: 1.553E-07 | global batch size: 16 | lm loss: 1.136237E+01 | loss scale: 4096.0 | grad norm: 1648846.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 36/ 159576 | consumed samples: 576 | elapsed time per iteration (ms): 13789.0 | learning rate: 1.598E-07 | global batch size: 16 | lm loss: 1.143323E+01 | loss scale: 4096.0 | grad norm: 1598861.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 37/ 159576 | consumed samples: 592 | elapsed time per iteration (ms): 13658.0 | learning rate: 1.642E-07 | global batch size: 16 | lm loss: 1.115875E+01 | loss scale: 4096.0 | grad norm: 1562919.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 38/ 159576 | consumed samples: 608 | elapsed time per iteration (ms): 13961.2 | learning rate: 1.686E-07 | global batch size: 16 | lm loss: 1.117768E+01 | loss scale: 4096.0 | grad norm: 1565543.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 39/ 159576 | consumed samples: 624 | elapsed time per iteration (ms): 13410.4 | learning rate: 1.731E-07 | global batch size: 16 | lm loss: 1.111340E+01 | loss scale: 4096.0 | grad norm: 1536768.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 40/ 159576 | consumed samples: 640 | elapsed time per iteration (ms): 13891.8 | learning rate: 1.775E-07 | global batch size: 16 | lm loss: 1.106657E+01 | loss scale: 4096.0 | grad norm: 1548421.837 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 41/ 159576 | consumed samples: 656 | elapsed time per iteration (ms): 13633.3 | learning rate: 1.820E-07 | global batch size: 16 | lm loss: 1.094995E+01 | loss scale: 4096.0 | grad norm: 1532446.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 42/ 159576 | consumed samples: 672 | elapsed time per iteration (ms): 13643.8 | learning rate: 1.864E-07 | global batch size: 16 | lm loss: 1.087856E+01 | loss scale: 4096.0 | grad norm: 1531337.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 43/ 159576 | consumed samples: 688 | elapsed time per iteration (ms): 13630.7 | learning rate: 1.908E-07 | global batch size: 16 | lm loss: 1.084412E+01 | loss scale: 4096.0 | grad norm: 1473539.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 44/ 159576 | consumed samples: 704 | elapsed time per iteration (ms): 14118.0 | learning rate: 1.953E-07 | global batch size: 16 | lm loss: 1.114596E+01 | loss scale: 4096.0 | grad norm: 1496700.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 45/ 159576 | consumed samples: 720 | elapsed time per iteration (ms): 13853.8 | learning rate: 1.997E-07 | global batch size: 16 | lm loss: 1.092829E+01 | loss scale: 4096.0 | grad norm: 1454980.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 46/ 159576 | consumed samples: 736 | elapsed time per iteration (ms): 13549.0 | learning rate: 2.041E-07 | global batch size: 16 | lm loss: 1.074461E+01 | loss scale: 4096.0 | grad norm: 1397083.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 47/ 159576 | consumed samples: 752 | elapsed time per iteration (ms): 13627.3 | learning rate: 2.086E-07 | global batch size: 16 | lm loss: 1.066580E+01 | loss scale: 4096.0 | grad norm: 1311670.870 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 48/ 159576 | consumed samples: 768 | elapsed time per iteration (ms): 13674.9 | learning rate: 2.130E-07 | global batch size: 16 | lm loss: 1.055744E+01 | loss scale: 4096.0 | grad norm: 1292299.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 49/ 159576 | consumed samples: 784 | elapsed time per iteration (ms): 13932.1 | learning rate: 2.175E-07 | global batch size: 16 | lm loss: 1.060610E+01 | loss scale: 4096.0 | grad norm: 1283482.631 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 50/ 159576 | consumed samples: 800 | elapsed time per iteration (ms): 13665.9 | learning rate: 2.219E-07 | global batch size: 16 | lm loss: 1.063007E+01 | loss scale: 4096.0 | grad norm: 1228203.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 51/ 159576 | consumed samples: 816 | elapsed time per iteration (ms): 13667.5 | learning rate: 2.263E-07 | global batch size: 16 | lm loss: 1.046357E+01 | loss scale: 4096.0 | grad norm: 1219490.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 52/ 159576 | consumed samples: 832 | elapsed time per iteration (ms): 13793.6 | learning rate: 2.308E-07 | global batch size: 16 | lm loss: 1.061804E+01 | loss scale: 4096.0 | grad norm: 1197068.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 53/ 159576 | consumed samples: 848 | elapsed time per iteration (ms): 14209.6 | learning rate: 2.352E-07 | global batch size: 16 | lm loss: 1.041930E+01 | loss scale: 4096.0 | grad norm: 1168890.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 54/ 159576 | consumed samples: 864 | elapsed time per iteration (ms): 13453.2 | learning rate: 2.396E-07 | global batch size: 16 | lm loss: 1.035855E+01 | loss scale: 4096.0 | grad norm: 1126594.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 55/ 159576 | consumed samples: 880 | elapsed time per iteration (ms): 13666.6 | learning rate: 2.441E-07 | global batch size: 16 | lm loss: 1.051081E+01 | loss scale: 4096.0 | grad norm: 1080949.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 56/ 159576 | consumed samples: 896 | elapsed time per iteration (ms): 13689.5 | learning rate: 2.485E-07 | global batch size: 16 | lm loss: 1.048364E+01 | loss scale: 4096.0 | grad norm: 1069119.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 57/ 159576 | consumed samples: 912 | elapsed time per iteration (ms): 14289.6 | learning rate: 2.530E-07 | global batch size: 16 | lm loss: 1.048154E+01 | loss scale: 4096.0 | grad norm: 1016407.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 58/ 159576 | consumed samples: 928 | elapsed time per iteration (ms): 13663.2 | learning rate: 2.574E-07 | global batch size: 16 | lm loss: 1.019213E+01 | loss scale: 4096.0 | grad norm: 982402.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 59/ 159576 | consumed samples: 944 | elapsed time per iteration (ms): 13704.5 | learning rate: 2.618E-07 | global batch size: 16 | lm loss: 1.019982E+01 | loss scale: 4096.0 | grad norm: 965254.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 60/ 159576 | consumed samples: 960 | elapsed time per iteration (ms): 13846.3 | learning rate: 2.663E-07 | global batch size: 16 | lm loss: 1.021626E+01 | loss scale: 4096.0 | grad norm: 926021.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 61/ 159576 | consumed samples: 976 | elapsed time per iteration (ms): 13469.9 | learning rate: 2.707E-07 | global batch size: 16 | lm loss: 1.008368E+01 | loss scale: 4096.0 | grad norm: 911608.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 62/ 159576 | consumed samples: 992 | elapsed time per iteration (ms): 13774.9 | learning rate: 2.751E-07 | global batch size: 16 | lm loss: 9.892099E+00 | loss scale: 4096.0 | grad norm: 882114.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 63/ 159576 | consumed samples: 1008 | elapsed time per iteration (ms): 13514.1 | learning rate: 2.796E-07 | global batch size: 16 | lm loss: 9.876393E+00 | loss scale: 4096.0 | grad norm: 834416.962 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 64/ 159576 | consumed samples: 1024 | elapsed time per iteration (ms): 13538.5 | learning rate: 2.840E-07 | global batch size: 16 | lm loss: 9.927294E+00 | loss scale: 4096.0 | grad norm: 814691.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 65/ 159576 | consumed samples: 1040 | elapsed time per iteration (ms): 13496.5 | learning rate: 2.885E-07 | global batch size: 16 | lm loss: 1.024293E+01 | loss scale: 4096.0 | grad norm: 821175.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 66/ 159576 | consumed samples: 1056 | elapsed time per iteration (ms): 14030.7 | learning rate: 2.929E-07 | global batch size: 16 | lm loss: 9.930872E+00 | loss scale: 4096.0 | grad norm: 759629.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 67/ 159576 | consumed samples: 1072 | elapsed time per iteration (ms): 13743.1 | learning rate: 2.973E-07 | global batch size: 16 | lm loss: 9.852800E+00 | loss scale: 4096.0 | grad norm: 734440.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 68/ 159576 | consumed samples: 1088 | elapsed time per iteration (ms): 13293.2 | learning rate: 3.018E-07 | global batch size: 16 | lm loss: 9.786448E+00 | loss scale: 4096.0 | grad norm: 702591.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 69/ 159576 | consumed samples: 1104 | elapsed time per iteration (ms): 13515.6 | learning rate: 3.062E-07 | global batch size: 16 | lm loss: 9.917148E+00 | loss scale: 4096.0 | grad norm: 689937.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 70/ 159576 | consumed samples: 1120 | elapsed time per iteration (ms): 13786.0 | learning rate: 3.107E-07 | global batch size: 16 | lm loss: 9.593161E+00 | loss scale: 4096.0 | grad norm: 634541.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 71/ 159576 | consumed samples: 1136 | elapsed time per iteration (ms): 13761.6 | learning rate: 3.151E-07 | global batch size: 16 | lm loss: 9.685747E+00 | loss scale: 4096.0 | grad norm: 620089.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 72/ 159576 | consumed samples: 1152 | elapsed time per iteration (ms): 13503.1 | learning rate: 3.195E-07 | global batch size: 16 | lm loss: 9.550736E+00 | loss scale: 4096.0 | grad norm: 592735.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 73/ 159576 | consumed samples: 1168 | elapsed time per iteration (ms): 13574.6 | learning rate: 3.240E-07 | global batch size: 16 | lm loss: 9.780053E+00 | loss scale: 4096.0 | grad norm: 578902.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 74/ 159576 | consumed samples: 1184 | elapsed time per iteration (ms): 13563.6 | learning rate: 3.284E-07 | global batch size: 16 | lm loss: 9.660094E+00 | loss scale: 4096.0 | grad norm: 549632.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 75/ 159576 | consumed samples: 1200 | elapsed time per iteration (ms): 13751.3 | learning rate: 3.328E-07 | global batch size: 16 | lm loss: 9.715110E+00 | loss scale: 4096.0 | grad norm: 523457.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
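Each iteration record above follows one fixed pipe-delimited layout, so the whole log can be turned into loss and learning-rate curves with a single regex. A sketch of such a parser (the regex assumes the single-space layout seen in this excerpt):

    import re

    PAT = re.compile(
        r"iteration (\d+)/ \d+ \| consumed samples: (\d+) \| "
        r"elapsed time per iteration \(ms\): ([\d.]+) \| "
        r"learning rate: ([\d.E+-]+) \| global batch size: (\d+) \| "
        r"lm loss: ([\d.E+-]+)")

    def parse_iteration(line):
        # Returns None for non-iteration lines (warnings, PULSE, time (ms), ...).
        m = PAT.search(line)
        if m is None:
            return None
        it, samples, ms, lr, gbs, loss = m.groups()
        return {"iteration": int(it), "consumed_samples": int(samples),
                "ms_per_iter": float(ms), "lr": float(lr),
                "global_batch_size": int(gbs), "lm_loss": float(loss)}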
- iteration 76/ 159576 | consumed samples: 1216 | elapsed time per iteration (ms): 13613.9 | learning rate: 3.373E-07 | global batch size: 16 | lm loss: 9.548697E+00 | loss scale: 4096.0 | grad norm: 559789.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 77/ 159576 | consumed samples: 1232 | elapsed time per iteration (ms): 13668.9 | learning rate: 3.417E-07 | global batch size: 16 | lm loss: 9.395579E+00 | loss scale: 4096.0 | grad norm: 516053.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 78/ 159576 | consumed samples: 1248 | elapsed time per iteration (ms): 13540.8 | learning rate: 3.462E-07 | global batch size: 16 | lm loss: 9.450207E+00 | loss scale: 4096.0 | grad norm: 491518.990 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 79/ 159576 | consumed samples: 1264 | elapsed time per iteration (ms): 13951.5 | learning rate: 3.506E-07 | global batch size: 16 | lm loss: 9.312221E+00 | loss scale: 4096.0 | grad norm: 445025.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 80/ 159576 | consumed samples: 1280 | elapsed time per iteration (ms): 13710.1 | learning rate: 3.550E-07 | global batch size: 16 | lm loss: 9.362122E+00 | loss scale: 4096.0 | grad norm: 498046.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 81/ 159576 | consumed samples: 1296 | elapsed time per iteration (ms): 13653.8 | learning rate: 3.595E-07 | global batch size: 16 | lm loss: 9.684261E+00 | loss scale: 4096.0 | grad norm: 460137.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 82/ 159576 | consumed samples: 1312 | elapsed time per iteration (ms): 13416.1 | learning rate: 3.639E-07 | global batch size: 16 | lm loss: 9.111031E+00 | loss scale: 4096.0 | grad norm: 462196.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 83/ 159576 | consumed samples: 1328 | elapsed time per iteration (ms): 13589.7 | learning rate: 3.683E-07 | global batch size: 16 | lm loss: 9.424231E+00 | loss scale: 4096.0 | grad norm: 387492.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 84/ 159576 | consumed samples: 1344 | elapsed time per iteration (ms): 13890.8 | learning rate: 3.728E-07 | global batch size: 16 | lm loss: 9.225885E+00 | loss scale: 4096.0 | grad norm: 477146.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 85/ 159576 | consumed samples: 1360 | elapsed time per iteration (ms): 13578.1 | learning rate: 3.772E-07 | global batch size: 16 | lm loss: 9.449253E+00 | loss scale: 4096.0 | grad norm: 498838.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 86/ 159576 | consumed samples: 1376 | elapsed time per iteration (ms): 13600.8 | learning rate: 3.817E-07 | global batch size: 16 | lm loss: 9.186915E+00 | loss scale: 4096.0 | grad norm: 359821.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 87/ 159576 | consumed samples: 1392 | elapsed time per iteration (ms): 13578.0 | learning rate: 3.861E-07 | global batch size: 16 | lm loss: 9.169426E+00 | loss scale: 4096.0 | grad norm: 336361.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 88/ 159576 | consumed samples: 1408 | elapsed time per iteration (ms): 14258.1 | learning rate: 3.905E-07 | global batch size: 16 | lm loss: 9.174639E+00 | loss scale: 4096.0 | grad norm: 513262.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 89/ 159576 | consumed samples: 1424 | elapsed time per iteration (ms): 13350.5 | learning rate: 3.950E-07 | global batch size: 16 | lm loss: 9.322023E+00 | loss scale: 4096.0 | grad norm: 417913.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 90/ 159576 | consumed samples: 1440 | elapsed time per iteration (ms): 13582.0 | learning rate: 3.994E-07 | global batch size: 16 | lm loss: 9.319530E+00 | loss scale: 4096.0 | grad norm: 326159.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 91/ 159576 | consumed samples: 1456 | elapsed time per iteration (ms): 13577.6 | learning rate: 4.038E-07 | global batch size: 16 | lm loss: 9.305362E+00 | loss scale: 4096.0 | grad norm: 312504.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 92/ 159576 | consumed samples: 1472 | elapsed time per iteration (ms): 13979.9 | learning rate: 4.083E-07 | global batch size: 16 | lm loss: 8.797226E+00 | loss scale: 4096.0 | grad norm: 299274.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 93/ 159576 | consumed samples: 1488 | elapsed time per iteration (ms): 13685.6 | learning rate: 4.127E-07 | global batch size: 16 | lm loss: 9.470177E+00 | loss scale: 4096.0 | grad norm: 889931.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 94/ 159576 | consumed samples: 1504 | elapsed time per iteration (ms): 13625.1 | learning rate: 4.172E-07 | global batch size: 16 | lm loss: 9.601658E+00 | loss scale: 4096.0 | grad norm: 858157.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 95/ 159576 | consumed samples: 1520 | elapsed time per iteration (ms): 13713.7 | learning rate: 4.216E-07 | global batch size: 16 | lm loss: 9.093191E+00 | loss scale: 4096.0 | grad norm: 308888.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 96/ 159576 | consumed samples: 1536 | elapsed time per iteration (ms): 13441.7 | learning rate: 4.260E-07 | global batch size: 16 | lm loss: 9.258781E+00 | loss scale: 4096.0 | grad norm: 285375.841 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 97/ 159576 | consumed samples: 1552 | elapsed time per iteration (ms): 13952.1 | learning rate: 4.305E-07 | global batch size: 16 | lm loss: 9.267257E+00 | loss scale: 4096.0 | grad norm: 266598.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 98/ 159576 | consumed samples: 1568 | elapsed time per iteration (ms): 13570.4 | learning rate: 4.349E-07 | global batch size: 16 | lm loss: 9.302748E+00 | loss scale: 4096.0 | grad norm: 430050.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
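"loss scale: 4096.0" has not moved and "number of skipped iterations" is still 0, so fp16 has not overflowed anywhere in this window; under dynamic loss scaling the scale only changes on an overflow (shrink and skip the step) or after a long overflow-free run (grow). A generic sketch of that policy, not the exact DeepSpeed implementation (the window and factor values here are illustrative):

    class DynamicLossScaler:
        """Halve the scale and skip the step on overflow; double it after
        `window` consecutive overflow-free steps."""
        def __init__(self, scale=4096.0, factor=2.0, window=1000):
            self.scale, self.factor, self.window = scale, factor, window
            self.good_steps = 0

        def update(self, found_overflow):
            if found_overflow:
                self.scale /= self.factor   # shrink the scale
                self.good_steps = 0
                return False                # caller should skip the optimizer step
            self.good_steps += 1
            if self.good_steps % self.window == 0:
                self.scale *= self.factor   # cautiously grow back
            return True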
- iteration 99/ 159576 | consumed samples: 1584 | elapsed time per iteration (ms): 13655.7 | learning rate: 4.393E-07 | global batch size: 16 | lm loss: 9.206352E+00 | loss scale: 4096.0 | grad norm: 522965.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 100/ 159576 | consumed samples: 1600 | elapsed time per iteration (ms): 13606.3 | learning rate: 4.438E-07 | global batch size: 16 | lm loss: 9.212991E+00 | loss scale: 4096.0 | grad norm: 351294.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 101/ 159576 | consumed samples: 1616 | elapsed time per iteration (ms): 14021.3 | learning rate: 4.482E-07 | global batch size: 16 | lm loss: 9.392309E+00 | loss scale: 4096.0 | grad norm: 249407.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 102/ 159576 | consumed samples: 1632 | elapsed time per iteration (ms): 13722.5 | learning rate: 4.527E-07 | global batch size: 16 | lm loss: 9.173745E+00 | loss scale: 4096.0 | grad norm: 230190.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 103/ 159576 | consumed samples: 1648 | elapsed time per iteration (ms): 13481.3 | learning rate: 4.571E-07 | global batch size: 16 | lm loss: 9.060183E+00 | loss scale: 4096.0 | grad norm: 535519.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 104/ 159576 | consumed samples: 1664 | elapsed time per iteration (ms): 13573.2 | learning rate: 4.615E-07 | global batch size: 16 | lm loss: 8.820353E+00 | loss scale: 4096.0 | grad norm: 252106.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 105/ 159576 | consumed samples: 1680 | elapsed time per iteration (ms): 13679.8 | learning rate: 4.660E-07 | global batch size: 16 | lm loss: 8.907228E+00 | loss scale: 4096.0 | grad norm: 227304.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 106/ 159576 | consumed samples: 1696 | elapsed time per iteration (ms): 13833.6 | learning rate: 4.704E-07 | global batch size: 16 | lm loss: 8.920894E+00 | loss scale: 4096.0 | grad norm: 226622.044 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 107/ 159576 | consumed samples: 1712 | elapsed time per iteration (ms): 13577.9 | learning rate: 4.749E-07 | global batch size: 16 | lm loss: 8.839094E+00 | loss scale: 4096.0 | grad norm: 188033.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 108/ 159576 | consumed samples: 1728 | elapsed time per iteration (ms): 13620.7 | learning rate: 4.793E-07 | global batch size: 16 | lm loss: 9.072345E+00 | loss scale: 4096.0 | grad norm: 405511.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 109/ 159576 | consumed samples: 1744 | elapsed time per iteration (ms): 13608.5 | learning rate: 4.837E-07 | global batch size: 16 | lm loss: 8.981932E+00 | loss scale: 4096.0 | grad norm: 326365.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 110/ 159576 | consumed samples: 1760 | elapsed time per iteration (ms): 13945.7 | learning rate: 4.882E-07 | global batch size: 16 | lm loss: 8.900158E+00 | loss scale: 4096.0 | grad norm: 183771.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 111/ 159576 | consumed samples: 1776 | elapsed time per iteration (ms): 13542.6 | learning rate: 4.926E-07 | global batch size: 16 | lm loss: 8.908926E+00 | loss scale: 4096.0 | grad norm: 189581.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 112/ 159576 | consumed samples: 1792 | elapsed time per iteration (ms): 13715.6 | learning rate: 4.970E-07 | global batch size: 16 | lm loss: 8.738115E+00 | loss scale: 4096.0 | grad norm: 176974.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 113/ 159576 | consumed samples: 1808 | elapsed time per iteration (ms): 13456.9 | learning rate: 5.015E-07 | global batch size: 16 | lm loss: 9.185429E+00 | loss scale: 4096.0 | grad norm: 452577.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 114/ 159576 | consumed samples: 1824 | elapsed time per iteration (ms): 14039.5 | learning rate: 5.059E-07 | global batch size: 16 | lm loss: 9.235853E+00 | loss scale: 4096.0 | grad norm: 567475.961 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 115/ 159576 | consumed samples: 1840 | elapsed time per iteration (ms): 13568.6 | learning rate: 5.104E-07 | global batch size: 16 | lm loss: 8.848898E+00 | loss scale: 4096.0 | grad norm: 182062.035 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 116/ 159576 | consumed samples: 1856 | elapsed time per iteration (ms): 13607.1 | learning rate: 5.148E-07 | global batch size: 16 | lm loss: 8.955499E+00 | loss scale: 4096.0 | grad norm: 179172.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 117/ 159576 | consumed samples: 1872 | elapsed time per iteration (ms): 13798.7 | learning rate: 5.192E-07 | global batch size: 16 | lm loss: 8.835221E+00 | loss scale: 4096.0 | grad norm: 168846.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 118/ 159576 | consumed samples: 1888 | elapsed time per iteration (ms): 13424.3 | learning rate: 5.237E-07 | global batch size: 16 | lm loss: 9.120043E+00 | loss scale: 4096.0 | grad norm: 304218.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 119/ 159576 | consumed samples: 1904 | elapsed time per iteration (ms): 13992.7 | learning rate: 5.281E-07 | global batch size: 16 | lm loss: 8.877877E+00 | loss scale: 4096.0 | grad norm: 328004.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 120/ 159576 | consumed samples: 1920 | elapsed time per iteration (ms): 13739.9 | learning rate: 5.325E-07 | global batch size: 16 | lm loss: 9.091492E+00 | loss scale: 4096.0 | grad norm: 542667.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 121/ 159576 | consumed samples: 1936 | elapsed time per iteration (ms): 13438.9 | learning rate: 5.370E-07 | global batch size: 16 | lm loss: 8.963889E+00 | loss scale: 4096.0 | grad norm: 173633.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 122/ 159576 | consumed samples: 1952 | elapsed time per iteration (ms): 13659.9 | learning rate: 5.414E-07 | global batch size: 16 | lm loss: 8.973601E+00 | loss scale: 4096.0 | grad norm: 154883.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 123/ 159576 | consumed samples: 1968 | elapsed time per iteration (ms): 14034.9 | learning rate: 5.459E-07 | global batch size: 16 | lm loss: 8.932154E+00 | loss scale: 4096.0 | grad norm: 191305.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 124/ 159576 | consumed samples: 1984 | elapsed time per iteration (ms): 13642.6 | learning rate: 5.503E-07 | global batch size: 16 | lm loss: 8.718765E+00 | loss scale: 4096.0 | grad norm: 141927.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 125/ 159576 | consumed samples: 2000 | elapsed time per iteration (ms): 13607.3 | learning rate: 5.547E-07 | global batch size: 16 | lm loss: 9.022717E+00 | loss scale: 4096.0 | grad norm: 530230.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 126/ 159576 | consumed samples: 2016 | elapsed time per iteration (ms): 13623.2 | learning rate: 5.592E-07 | global batch size: 16 | lm loss: 9.160154E+00 | loss scale: 4096.0 | grad norm: 525377.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 127/ 159576 | consumed samples: 2032 | elapsed time per iteration (ms): 13944.5 | learning rate: 5.636E-07 | global batch size: 16 | lm loss: 8.602621E+00 | loss scale: 4096.0 | grad norm: 180832.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 128/ 159576 | consumed samples: 2048 | elapsed time per iteration (ms): 13652.1 | learning rate: 5.680E-07 | global batch size: 16 | lm loss: 8.848473E+00 | loss scale: 4096.0 | grad norm: 159006.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 129/ 159576 | consumed samples: 2064 | elapsed time per iteration (ms): 13619.4 | learning rate: 5.725E-07 | global batch size: 16 | lm loss: 8.697285E+00 | loss scale: 4096.0 | grad norm: 166208.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 130/ 159576 | consumed samples: 2080 | elapsed time per iteration (ms): 13649.8 | learning rate: 5.769E-07 | global batch size: 16 | lm loss: 8.738346E+00 | loss scale: 4096.0 | grad norm: 142582.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 131/ 159576 | consumed samples: 2096 | elapsed time per iteration (ms): 13648.8 | learning rate: 5.814E-07 | global batch size: 16 | lm loss: 8.628532E+00 | loss scale: 4096.0 | grad norm: 119745.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 132/ 159576 | consumed samples: 2112 | elapsed time per iteration (ms): 13855.7 | learning rate: 5.858E-07 | global batch size: 16 | lm loss: 8.681314E+00 | loss scale: 4096.0 | grad norm: 238581.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 133/ 159576 | consumed samples: 2128 | elapsed time per iteration (ms): 13614.3 | learning rate: 5.902E-07 | global batch size: 16 | lm loss: 8.853155E+00 | loss scale: 4096.0 | grad norm: 190597.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 134/ 159576 | consumed samples: 2144 | elapsed time per iteration (ms): 13742.8 | learning rate: 5.947E-07 | global batch size: 16 | lm loss: 8.840850E+00 | loss scale: 4096.0 | grad norm: 157001.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 135/ 159576 | consumed samples: 2160 | elapsed time per iteration (ms): 13481.4 | learning rate: 5.991E-07 | global batch size: 16 | lm loss: 8.721090E+00 | loss scale: 4096.0 | grad norm: 120761.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 136/ 159576 | consumed samples: 2176 | elapsed time per iteration (ms): 14037.0 | learning rate: 6.036E-07 | global batch size: 16 | lm loss: 8.786610E+00 | loss scale: 4096.0 | grad norm: 109166.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 137/ 159576 | consumed samples: 2192 | elapsed time per iteration (ms): 13631.2 | learning rate: 6.080E-07 | global batch size: 16 | lm loss: 8.825349E+00 | loss scale: 4096.0 | grad norm: 393039.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 138/ 159576 | consumed samples: 2208 | elapsed time per iteration (ms): 13698.2 | learning rate: 6.124E-07 | global batch size: 16 | lm loss: 8.681873E+00 | loss scale: 4096.0 | grad norm: 210924.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 139/ 159576 | consumed samples: 2224 | elapsed time per iteration (ms): 13641.8 | learning rate: 6.169E-07 | global batch size: 16 | lm loss: 8.758416E+00 | loss scale: 4096.0 | grad norm: 111138.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 140/ 159576 | consumed samples: 2240 | elapsed time per iteration (ms): 13650.3 | learning rate: 6.213E-07 | global batch size: 16 | lm loss: 8.646829E+00 | loss scale: 4096.0 | grad norm: 115663.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 141/ 159576 | consumed samples: 2256 | elapsed time per iteration (ms): 14097.3 | learning rate: 6.257E-07 | global batch size: 16 | lm loss: 8.653087E+00 | loss scale: 4096.0 | grad norm: 142126.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 142/ 159576 | consumed samples: 2272 | elapsed time per iteration (ms): 13468.2 | learning rate: 6.302E-07 | global batch size: 16 | lm loss: 8.647311E+00 | loss scale: 4096.0 | grad norm: 163914.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 143/ 159576 | consumed samples: 2288 | elapsed time per iteration (ms): 13544.7 | learning rate: 6.346E-07 | global batch size: 16 | lm loss: 8.564240E+00 | loss scale: 4096.0 | grad norm: 159952.939 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 144/ 159576 | consumed samples: 2304 | elapsed time per iteration (ms): 13642.1 | learning rate: 6.391E-07 | global batch size: 16 | lm loss: 8.789017E+00 | loss scale: 4096.0 | grad norm: 169255.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 145/ 159576 | consumed samples: 2320 | elapsed time per iteration (ms): 14181.4 | learning rate: 6.435E-07 | global batch size: 16 | lm loss: 8.811962E+00 | loss scale: 4096.0 | grad norm: 127162.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 146/ 159576 | consumed samples: 2336 | elapsed time per iteration (ms): 13492.3 | learning rate: 6.479E-07 | global batch size: 16 | lm loss: 8.774818E+00 | loss scale: 4096.0 | grad norm: 110483.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 147/ 159576 | consumed samples: 2352 | elapsed time per iteration (ms): 13671.3 | learning rate: 6.524E-07 | global batch size: 16 | lm loss: 8.753700E+00 | loss scale: 4096.0 | grad norm: 128181.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 148/ 159576 | consumed samples: 2368 | elapsed time per iteration (ms): 13675.0 | learning rate: 6.568E-07 | global batch size: 16 | lm loss: 8.742964E+00 | loss scale: 4096.0 | grad norm: 140698.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 149/ 159576 | consumed samples: 2384 | elapsed time per iteration (ms): 14154.8 | learning rate: 6.612E-07 | global batch size: 16 | lm loss: 8.705631E+00 | loss scale: 4096.0 | grad norm: 284561.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 150/ 159576 | consumed samples: 2400 | elapsed time per iteration (ms): 13301.3 | learning rate: 6.657E-07 | global batch size: 16 | lm loss: 8.639321E+00 | loss scale: 4096.0 | grad norm: 158457.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 151/ 159576 | consumed samples: 2416 | elapsed time per iteration (ms): 13553.4 | learning rate: 6.701E-07 | global batch size: 16 | lm loss: 8.747204E+00 | loss scale: 4096.0 | grad norm: 217035.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 152/ 159576 | consumed samples: 2432 | elapsed time per iteration (ms): 13577.6 | learning rate: 6.746E-07 | global batch size: 16 | lm loss: 8.711011E+00 | loss scale: 4096.0 | grad norm: 170149.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 153/ 159576 | consumed samples: 2448 | elapsed time per iteration (ms): 13522.0 | learning rate: 6.790E-07 | global batch size: 16 | lm loss: 8.717499E+00 | loss scale: 4096.0 | grad norm: 103133.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 154/ 159576 | consumed samples: 2464 | elapsed time per iteration (ms): 13883.8 | learning rate: 6.834E-07 | global batch size: 16 | lm loss: 8.587013E+00 | loss scale: 4096.0 | grad norm: 99765.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 155/ 159576 | consumed samples: 2480 | elapsed time per iteration (ms): 13554.0 | learning rate: 6.879E-07 | global batch size: 16 | lm loss: 8.698885E+00 | loss scale: 4096.0 | grad norm: 282680.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 156/ 159576 | consumed samples: 2496 | elapsed time per iteration (ms): 13692.4 | learning rate: 6.923E-07 | global batch size: 16 | lm loss: 9.289864E+00 | loss scale: 4096.0 | grad norm: 609278.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 157/ 159576 | consumed samples: 2512 | elapsed time per iteration (ms): 13306.0 | learning rate: 6.967E-07 | global batch size: 16 | lm loss: 8.803203E+00 | loss scale: 4096.0 | grad norm: 221182.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
-[2021-09-24 02:48:11] PULSE: tr8-104B is waiting to be scheduled (1159457_[1-10%1] on 'gpu_p13' partition)
-[2021-09-24 02:48:11] PULSE: tr8-104B is scheduled to start in 18:26:36 (at 2021-09-24T21:14:48) (1161605 on 'gpu_p13' partition)
-[2021-09-24 02:48:11] PULSE: tr8-104B is running for 37:09 since 2021-09-24T02:11:02 (1161730 on 'gpu_p13' partition (r6i4n7,r6i5n[7-8],r6i6n[0,6,8],r6i7n3,r7i2n[2,4-5],r7i3n2,r7i6n[2-4],r7i7n[3,7-8],r8i0n[2-3,5-8],r8i1n[0,2-4],r8i3n[0-2],r8i5n[3-4],r8i7n[3-6,8],r9i0n[0-2],r9i1n[0-3],r9i2n[3-5,8],r9i3n[0-1,7-8],r9i4n[0-2],r9i5n[3-8],r9i6n[0,7-8])
- iteration 158/ 159576 | consumed samples: 2528 | elapsed time per iteration (ms): 13873.2 | learning rate: 7.012E-07 | global batch size: 16 | lm loss: 8.628306E+00 | loss scale: 4096.0 | grad norm: 200507.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 159/ 159576 | consumed samples: 2544 | elapsed time per iteration (ms): 13466.2 | learning rate: 7.056E-07 | global batch size: 16 | lm loss: 8.632781E+00 | loss scale: 4096.0 | grad norm: 103638.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 160/ 159576 | consumed samples: 2560 | elapsed time per iteration (ms): 13494.3 | learning rate: 7.101E-07 | global batch size: 16 | lm loss: 8.596104E+00 | loss scale: 4096.0 | grad norm: 92105.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 161/ 159576 | consumed samples: 2576 | elapsed time per iteration (ms): 13517.5 | learning rate: 7.145E-07 | global batch size: 16 | lm loss: 8.408714E+00 | loss scale: 4096.0 | grad norm: 78965.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 162/ 159576 | consumed samples: 2592 | elapsed time per iteration (ms): 13540.1 | learning rate: 7.189E-07 | global batch size: 16 | lm loss: 9.134837E+00 | loss scale: 4096.0 | grad norm: 524949.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 163/ 159576 | consumed samples: 2608 | elapsed time per iteration (ms): 13879.1 | learning rate: 7.234E-07 | global batch size: 16 | lm loss: 8.601346E+00 | loss scale: 4096.0 | grad norm: 206465.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 164/ 159576 | consumed samples: 2624 | elapsed time per iteration (ms): 13564.5 | learning rate: 7.278E-07 | global batch size: 16 | lm loss: 8.734079E+00 | loss scale: 4096.0 | grad norm: 159985.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 165/ 159576 | consumed samples: 2640 | elapsed time per iteration (ms): 13607.4 | learning rate: 7.322E-07 | global batch size: 16 | lm loss: 8.629238E+00 | loss scale: 4096.0 | grad norm: 89678.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 166/ 159576 | consumed samples: 2656 | elapsed time per iteration (ms): 13687.7 | learning rate: 7.367E-07 | global batch size: 16 | lm loss: 8.753635E+00 | loss scale: 4096.0 | grad norm: 108761.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 167/ 159576 | consumed samples: 2672 | elapsed time per iteration (ms): 14101.4 | learning rate: 7.411E-07 | global batch size: 16 | lm loss: 8.647141E+00 | loss scale: 4096.0 | grad norm: 78778.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 168/ 159576 | consumed samples: 2688 | elapsed time per iteration (ms): 13827.5 | learning rate: 7.456E-07 | global batch size: 16 | lm loss: 8.838135E+00 | loss scale: 4096.0 | grad norm: 301360.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 169/ 159576 | consumed samples: 2704 | elapsed time per iteration (ms): 13776.5 | learning rate: 7.500E-07 | global batch size: 16 | lm loss: 8.865972E+00 | loss scale: 4096.0 | grad norm: 230779.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 170/ 159576 | consumed samples: 2720 | elapsed time per iteration (ms): 13667.3 | learning rate: 7.544E-07 | global batch size: 16 | lm loss: 8.716210E+00 | loss scale: 4096.0 | grad norm: 133087.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 171/ 159576 | consumed samples: 2736 | elapsed time per iteration (ms): 13974.1 | learning rate: 7.589E-07 | global batch size: 16 | lm loss: 8.726005E+00 | loss scale: 4096.0 | grad norm: 112595.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 172/ 159576 | consumed samples: 2752 | elapsed time per iteration (ms): 13644.3 | learning rate: 7.633E-07 | global batch size: 16 | lm loss: 8.704071E+00 | loss scale: 4096.0 | grad norm: 92111.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 173/ 159576 | consumed samples: 2768 | elapsed time per iteration (ms): 13586.4 | learning rate: 7.678E-07 | global batch size: 16 | lm loss: 8.823001E+00 | loss scale: 4096.0 | grad norm: 93068.020 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 174/ 159576 | consumed samples: 2784 | elapsed time per iteration (ms): 13629.3 | learning rate: 7.722E-07 | global batch size: 16 | lm loss: 8.521597E+00 | loss scale: 4096.0 | grad norm: 79887.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 175/ 159576 | consumed samples: 2800 | elapsed time per iteration (ms): 13647.0 | learning rate: 7.766E-07 | global batch size: 16 | lm loss: 9.370278E+00 | loss scale: 4096.0 | grad norm: 576797.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 176/ 159576 | consumed samples: 2816 | elapsed time per iteration (ms): 13993.8 | learning rate: 7.811E-07 | global batch size: 16 | lm loss: 9.255205E+00 | loss scale: 4096.0 | grad norm: 337846.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 177/ 159576 | consumed samples: 2832 | elapsed time per iteration (ms): 13778.2 | learning rate: 7.855E-07 | global batch size: 16 | lm loss: 9.038449E+00 | loss scale: 4096.0 | grad norm: 339366.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 178/ 159576 | consumed samples: 2848 | elapsed time per iteration (ms): 13515.3 | 
learning rate: 7.899E-07 | global batch size: 16 | lm loss: 8.771539E+00 | loss scale: 4096.0 | grad norm: 216761.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 179/ 159576 | consumed samples: 2864 | elapsed time per iteration (ms): 13657.6 | learning rate: 7.944E-07 | global batch size: 16 | lm loss: 8.718536E+00 | loss scale: 4096.0 | grad norm: 103470.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 180/ 159576 | consumed samples: 2880 | elapsed time per iteration (ms): 14095.5 | learning rate: 7.988E-07 | global batch size: 16 | lm loss: 8.968449E+00 | loss scale: 4096.0 | grad norm: 88300.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 181/ 159576 | consumed samples: 2896 | elapsed time per iteration (ms): 13570.0 | learning rate: 8.033E-07 | global batch size: 16 | lm loss: 8.743597E+00 | loss scale: 4096.0 | grad norm: 73637.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 182/ 159576 | consumed samples: 2912 | elapsed time per iteration (ms): 13631.2 | learning rate: 8.077E-07 | global batch size: 16 | lm loss: 8.650385E+00 | loss scale: 4096.0 | grad norm: 170612.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 183/ 159576 | consumed samples: 2928 | elapsed time per iteration (ms): 13666.1 | learning rate: 8.121E-07 | global batch size: 16 | lm loss: 8.764441E+00 | loss scale: 4096.0 | grad norm: 157032.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 184/ 159576 | consumed samples: 2944 | elapsed time per iteration (ms): 14033.7 | learning rate: 8.166E-07 | global batch size: 16 | lm loss: 8.546231E+00 | loss scale: 4096.0 | grad norm: 68818.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 185/ 159576 | consumed samples: 2960 | elapsed time per iteration (ms): 13755.2 | learning rate: 8.210E-07 | global batch size: 16 | lm loss: 8.605597E+00 | loss scale: 4096.0 | grad norm: 245599.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 186/ 159576 | consumed samples: 2976 | elapsed time per iteration (ms): 13693.9 | learning rate: 8.254E-07 | global batch size: 16 | lm loss: 8.735710E+00 | loss scale: 4096.0 | grad norm: 193090.020 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 187/ 159576 | consumed samples: 2992 | elapsed time per iteration (ms): 13666.7 | learning rate: 8.299E-07 | global batch size: 16 | lm loss: 8.800616E+00 | loss scale: 4096.0 | grad norm: 121643.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 188/ 159576 | consumed samples: 3008 | elapsed time per iteration (ms): 13617.1 | learning rate: 8.343E-07 | global batch size: 16 | lm loss: 8.450140E+00 | loss scale: 4096.0 | grad norm: 91010.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 189/ 159576 | consumed samples: 3024 | elapsed time per iteration (ms): 14107.4 | learning rate: 8.388E-07 | global batch size: 16 | lm loss: 8.680673E+00 | loss scale: 4096.0 | grad norm: 171815.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan 
iterations: 0 | -time (ms) - iteration 190/ 159576 | consumed samples: 3040 | elapsed time per iteration (ms): 13662.7 | learning rate: 8.432E-07 | global batch size: 16 | lm loss: 8.619300E+00 | loss scale: 4096.0 | grad norm: 80825.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 191/ 159576 | consumed samples: 3056 | elapsed time per iteration (ms): 13715.7 | learning rate: 8.476E-07 | global batch size: 16 | lm loss: 8.438683E+00 | loss scale: 4096.0 | grad norm: 68255.978 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 192/ 159576 | consumed samples: 3072 | elapsed time per iteration (ms): 13611.5 | learning rate: 8.521E-07 | global batch size: 16 | lm loss: 8.685935E+00 | loss scale: 4096.0 | grad norm: 100702.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 193/ 159576 | consumed samples: 3088 | elapsed time per iteration (ms): 14234.2 | learning rate: 8.565E-07 | global batch size: 16 | lm loss: 8.644808E+00 | loss scale: 4096.0 | grad norm: 193299.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 194/ 159576 | consumed samples: 3104 | elapsed time per iteration (ms): 13631.4 | learning rate: 8.609E-07 | global batch size: 16 | lm loss: 8.574228E+00 | loss scale: 4096.0 | grad norm: 141638.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 195/ 159576 | consumed samples: 3120 | elapsed time per iteration (ms): 13610.1 | learning rate: 8.654E-07 | global batch size: 16 | lm loss: 8.461662E+00 | loss scale: 4096.0 | grad norm: 102623.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 196/ 159576 | consumed samples: 3136 | elapsed time per iteration (ms): 13581.2 | learning rate: 8.698E-07 | global batch size: 16 | lm loss: 8.478310E+00 | loss scale: 4096.0 | grad norm: 64740.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 197/ 159576 | consumed samples: 3152 | elapsed time per iteration (ms): 13626.3 | learning rate: 8.743E-07 | global batch size: 16 | lm loss: 8.468125E+00 | loss scale: 4096.0 | grad norm: 113590.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 198/ 159576 | consumed samples: 3168 | elapsed time per iteration (ms): 14045.8 | learning rate: 8.787E-07 | global batch size: 16 | lm loss: 8.800446E+00 | loss scale: 4096.0 | grad norm: 157117.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 199/ 159576 | consumed samples: 3184 | elapsed time per iteration (ms): 13670.2 | learning rate: 8.831E-07 | global batch size: 16 | lm loss: 8.530574E+00 | loss scale: 4096.0 | grad norm: 71020.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 200/ 159576 | consumed samples: 3200 | elapsed time per iteration (ms): 13673.4 | learning rate: 8.876E-07 | global batch size: 16 | lm loss: 8.573134E+00 | loss scale: 4096.0 | grad norm: 68974.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 201/ 159576 | consumed samples: 3216 | elapsed time per iteration (ms): 13793.0 | learning rate: 8.920E-07 | global batch size: 16 | lm loss: 8.408599E+00 
| loss scale: 4096.0 | grad norm: 69080.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 202/ 159576 | consumed samples: 3232 | elapsed time per iteration (ms): 13826.3 | learning rate: 8.964E-07 | global batch size: 16 | lm loss: 8.511511E+00 | loss scale: 4096.0 | grad norm: 111260.930 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 203/ 159576 | consumed samples: 3248 | elapsed time per iteration (ms): 13532.8 | learning rate: 9.009E-07 | global batch size: 16 | lm loss: 8.359414E+00 | loss scale: 4096.0 | grad norm: 178104.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 204/ 159576 | consumed samples: 3264 | elapsed time per iteration (ms): 13664.5 | learning rate: 9.053E-07 | global batch size: 16 | lm loss: 8.641071E+00 | loss scale: 4096.0 | grad norm: 200697.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 205/ 159576 | consumed samples: 3280 | elapsed time per iteration (ms): 13644.0 | learning rate: 9.098E-07 | global batch size: 16 | lm loss: 8.579686E+00 | loss scale: 4096.0 | grad norm: 127286.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 206/ 159576 | consumed samples: 3296 | elapsed time per iteration (ms): 14372.0 | learning rate: 9.142E-07 | global batch size: 16 | lm loss: 8.340457E+00 | loss scale: 4096.0 | grad norm: 79901.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 207/ 159576 | consumed samples: 3312 | elapsed time per iteration (ms): 13542.0 | learning rate: 9.186E-07 | global batch size: 16 | lm loss: 8.573874E+00 | loss scale: 4096.0 | grad norm: 54182.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 208/ 159576 | consumed samples: 3328 | elapsed time per iteration (ms): 13770.4 | learning rate: 9.231E-07 | global batch size: 16 | lm loss: 8.671753E+00 | loss scale: 4096.0 | grad norm: 118528.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 209/ 159576 | consumed samples: 3344 | elapsed time per iteration (ms): 13735.7 | learning rate: 9.275E-07 | global batch size: 16 | lm loss: 8.323320E+00 | loss scale: 4096.0 | grad norm: 84996.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 210/ 159576 | consumed samples: 3360 | elapsed time per iteration (ms): 13465.7 | learning rate: 9.320E-07 | global batch size: 16 | lm loss: 8.521966E+00 | loss scale: 4096.0 | grad norm: 58490.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 211/ 159576 | consumed samples: 3376 | elapsed time per iteration (ms): 14045.3 | learning rate: 9.364E-07 | global batch size: 16 | lm loss: 8.366361E+00 | loss scale: 4096.0 | grad norm: 60420.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 212/ 159576 | consumed samples: 3392 | elapsed time per iteration (ms): 13641.0 | learning rate: 9.408E-07 | global batch size: 16 | lm loss: 8.510538E+00 | loss scale: 4096.0 | grad norm: 107003.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 213/ 159576 | consumed samples: 3408 | 
elapsed time per iteration (ms): 13705.1 | learning rate: 9.453E-07 | global batch size: 16 | lm loss: 8.749462E+00 | loss scale: 4096.0 | grad norm: 127548.939 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 214/ 159576 | consumed samples: 3424 | elapsed time per iteration (ms): 13700.1 | learning rate: 9.497E-07 | global batch size: 16 | lm loss: 8.406161E+00 | loss scale: 4096.0 | grad norm: 77133.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 215/ 159576 | consumed samples: 3440 | elapsed time per iteration (ms): 14278.2 | learning rate: 9.541E-07 | global batch size: 16 | lm loss: 8.418405E+00 | loss scale: 4096.0 | grad norm: 62254.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 216/ 159576 | consumed samples: 3456 | elapsed time per iteration (ms): 13592.8 | learning rate: 9.586E-07 | global batch size: 16 | lm loss: 8.472538E+00 | loss scale: 4096.0 | grad norm: 50530.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 217/ 159576 | consumed samples: 3472 | elapsed time per iteration (ms): 13518.7 | learning rate: 9.630E-07 | global batch size: 16 | lm loss: 8.448650E+00 | loss scale: 4096.0 | grad norm: 80646.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 218/ 159576 | consumed samples: 3488 | elapsed time per iteration (ms): 13661.2 | learning rate: 9.675E-07 | global batch size: 16 | lm loss: 7.734177E+00 | loss scale: 4096.0 | grad norm: 149486.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 219/ 159576 | consumed samples: 3504 | elapsed time per iteration (ms): 14068.7 | learning rate: 9.719E-07 | global batch size: 16 | lm loss: 8.294590E+00 | loss scale: 4096.0 | grad norm: 56571.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 220/ 159576 | consumed samples: 3520 | elapsed time per iteration (ms): 13630.3 | learning rate: 9.763E-07 | global batch size: 16 | lm loss: 8.257124E+00 | loss scale: 4096.0 | grad norm: 62046.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 221/ 159576 | consumed samples: 3536 | elapsed time per iteration (ms): 13703.1 | learning rate: 9.808E-07 | global batch size: 16 | lm loss: 8.288898E+00 | loss scale: 4096.0 | grad norm: 59852.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 222/ 159576 | consumed samples: 3552 | elapsed time per iteration (ms): 13772.5 | learning rate: 9.852E-07 | global batch size: 16 | lm loss: 8.155066E+00 | loss scale: 4096.0 | grad norm: 58014.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 223/ 159576 | consumed samples: 3568 | elapsed time per iteration (ms): 13771.9 | learning rate: 9.896E-07 | global batch size: 16 | lm loss: 8.263331E+00 | loss scale: 4096.0 | grad norm: 63268.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 224/ 159576 | consumed samples: 3584 | elapsed time per iteration (ms): 14010.9 | learning rate: 9.941E-07 | global batch size: 16 | lm loss: 8.163802E+00 | loss scale: 4096.0 | grad norm: 57272.250 | num zeros: 0.0 | number of skipped 
iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 225/ 159576 | consumed samples: 3600 | elapsed time per iteration (ms): 13593.2 | learning rate: 9.985E-07 | global batch size: 16 | lm loss: 8.163125E+00 | loss scale: 4096.0 | grad norm: 42586.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 226/ 159576 | consumed samples: 3616 | elapsed time per iteration (ms): 13655.1 | learning rate: 1.003E-06 | global batch size: 16 | lm loss: 8.360060E+00 | loss scale: 4096.0 | grad norm: 122218.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 227/ 159576 | consumed samples: 3632 | elapsed time per iteration (ms): 13648.6 | learning rate: 1.007E-06 | global batch size: 16 | lm loss: 8.255043E+00 | loss scale: 4096.0 | grad norm: 85521.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 228/ 159576 | consumed samples: 3648 | elapsed time per iteration (ms): 14030.4 | learning rate: 1.012E-06 | global batch size: 16 | lm loss: 8.261985E+00 | loss scale: 4096.0 | grad norm: 67005.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 229/ 159576 | consumed samples: 3664 | elapsed time per iteration (ms): 13712.9 | learning rate: 1.016E-06 | global batch size: 16 | lm loss: 8.186491E+00 | loss scale: 4096.0 | grad norm: 56484.916 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 230/ 159576 | consumed samples: 3680 | elapsed time per iteration (ms): 13908.9 | learning rate: 1.021E-06 | global batch size: 16 | lm loss: 8.405298E+00 | loss scale: 4096.0 | grad norm: 76846.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 231/ 159576 | consumed samples: 3696 | elapsed time per iteration (ms): 13436.7 | learning rate: 1.025E-06 | global batch size: 16 | lm loss: 8.396565E+00 | loss scale: 4096.0 | grad norm: 65903.685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 232/ 159576 | consumed samples: 3712 | elapsed time per iteration (ms): 13847.3 | learning rate: 1.030E-06 | global batch size: 16 | lm loss: 8.280029E+00 | loss scale: 4096.0 | grad norm: 49376.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 233/ 159576 | consumed samples: 3728 | elapsed time per iteration (ms): 13817.4 | learning rate: 1.034E-06 | global batch size: 16 | lm loss: 8.356775E+00 | loss scale: 4096.0 | grad norm: 59866.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 234/ 159576 | consumed samples: 3744 | elapsed time per iteration (ms): 13586.3 | learning rate: 1.038E-06 | global batch size: 16 | lm loss: 8.429869E+00 | loss scale: 4096.0 | grad norm: 177436.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 235/ 159576 | consumed samples: 3760 | elapsed time per iteration (ms): 13599.7 | learning rate: 1.043E-06 | global batch size: 16 | lm loss: 8.434436E+00 | loss scale: 4096.0 | grad norm: 135413.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 236/ 159576 | consumed samples: 3776 | elapsed time per iteration (ms): 13650.1 | learning rate: 1.047E-06 | global batch size: 
16 | lm loss: 8.271558E+00 | loss scale: 4096.0 | grad norm: 90861.034 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 237/ 159576 | consumed samples: 3792 | elapsed time per iteration (ms): 14163.4 | learning rate: 1.052E-06 | global batch size: 16 | lm loss: 8.303068E+00 | loss scale: 4096.0 | grad norm: 54299.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 238/ 159576 | consumed samples: 3808 | elapsed time per iteration (ms): 13595.2 | learning rate: 1.056E-06 | global batch size: 16 | lm loss: 8.246891E+00 | loss scale: 4096.0 | grad norm: 58398.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 239/ 159576 | consumed samples: 3824 | elapsed time per iteration (ms): 13633.1 | learning rate: 1.061E-06 | global batch size: 16 | lm loss: 8.223282E+00 | loss scale: 4096.0 | grad norm: 58574.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 240/ 159576 | consumed samples: 3840 | elapsed time per iteration (ms): 13623.5 | learning rate: 1.065E-06 | global batch size: 16 | lm loss: 8.408007E+00 | loss scale: 4096.0 | grad norm: 128668.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 241/ 159576 | consumed samples: 3856 | elapsed time per iteration (ms): 14073.7 | learning rate: 1.070E-06 | global batch size: 16 | lm loss: 8.490035E+00 | loss scale: 4096.0 | grad norm: 228763.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 242/ 159576 | consumed samples: 3872 | elapsed time per iteration (ms): 13568.7 | learning rate: 1.074E-06 | global batch size: 16 | lm loss: 8.217072E+00 | loss scale: 4096.0 | grad norm: 54955.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 243/ 159576 | consumed samples: 3888 | elapsed time per iteration (ms): 13649.7 | learning rate: 1.078E-06 | global batch size: 16 | lm loss: 8.280759E+00 | loss scale: 4096.0 | grad norm: 70277.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 244/ 159576 | consumed samples: 3904 | elapsed time per iteration (ms): 13743.3 | learning rate: 1.083E-06 | global batch size: 16 | lm loss: 8.266622E+00 | loss scale: 4096.0 | grad norm: 52088.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 245/ 159576 | consumed samples: 3920 | elapsed time per iteration (ms): 13760.9 | learning rate: 1.087E-06 | global batch size: 16 | lm loss: 8.186391E+00 | loss scale: 4096.0 | grad norm: 45303.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 246/ 159576 | consumed samples: 3936 | elapsed time per iteration (ms): 13869.6 | learning rate: 1.092E-06 | global batch size: 16 | lm loss: 8.217053E+00 | loss scale: 4096.0 | grad norm: 66052.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 247/ 159576 | consumed samples: 3952 | elapsed time per iteration (ms): 13595.0 | learning rate: 1.096E-06 | global batch size: 16 | lm loss: 8.218720E+00 | loss scale: 4096.0 | grad norm: 63154.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 248/ 159576 | 
consumed samples: 3968 | elapsed time per iteration (ms): 13605.0 | learning rate: 1.101E-06 | global batch size: 16 | lm loss: 8.214328E+00 | loss scale: 4096.0 | grad norm: 54827.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 249/ 159576 | consumed samples: 3984 | elapsed time per iteration (ms): 13572.6 | learning rate: 1.105E-06 | global batch size: 16 | lm loss: 8.289627E+00 | loss scale: 4096.0 | grad norm: 112939.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 250/ 159576 | consumed samples: 4000 | elapsed time per iteration (ms): 13869.8 | learning rate: 1.109E-06 | global batch size: 16 | lm loss: 8.362014E+00 | loss scale: 4096.0 | grad norm: 56746.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 251/ 159576 | consumed samples: 4016 | elapsed time per iteration (ms): 13620.5 | learning rate: 1.114E-06 | global batch size: 16 | lm loss: 8.189938E+00 | loss scale: 4096.0 | grad norm: 56152.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 252/ 159576 | consumed samples: 4032 | elapsed time per iteration (ms): 13708.2 | learning rate: 1.118E-06 | global batch size: 16 | lm loss: 8.356908E+00 | loss scale: 4096.0 | grad norm: 78498.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 253/ 159576 | consumed samples: 4048 | elapsed time per iteration (ms): 13478.4 | learning rate: 1.123E-06 | global batch size: 16 | lm loss: 8.047684E+00 | loss scale: 4096.0 | grad norm: 66252.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 254/ 159576 | consumed samples: 4064 | elapsed time per iteration (ms): 14231.8 | learning rate: 1.127E-06 | global batch size: 16 | lm loss: 8.279363E+00 | loss scale: 4096.0 | grad norm: 85125.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 255/ 159576 | consumed samples: 4080 | elapsed time per iteration (ms): 13522.4 | learning rate: 1.132E-06 | global batch size: 16 | lm loss: 8.159877E+00 | loss scale: 4096.0 | grad norm: 48952.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 256/ 159576 | consumed samples: 4096 | elapsed time per iteration (ms): 13553.5 | learning rate: 1.136E-06 | global batch size: 16 | lm loss: 8.154376E+00 | loss scale: 4096.0 | grad norm: 41715.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 257/ 159576 | consumed samples: 4112 | elapsed time per iteration (ms): 13537.5 | learning rate: 1.141E-06 | global batch size: 16 | lm loss: 8.247561E+00 | loss scale: 4096.0 | grad norm: 57864.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 258/ 159576 | consumed samples: 4128 | elapsed time per iteration (ms): 13659.5 | learning rate: 1.145E-06 | global batch size: 16 | lm loss: 8.167631E+00 | loss scale: 4096.0 | grad norm: 45439.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 259/ 159576 | consumed samples: 4144 | elapsed time per iteration (ms): 14023.4 | learning rate: 1.149E-06 | global batch size: 16 | lm loss: 8.081510E+00 | loss scale: 4096.0 | grad norm: 54108.939 | num zeros: 
0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 260/ 159576 | consumed samples: 4160 | elapsed time per iteration (ms): 13447.5 | learning rate: 1.154E-06 | global batch size: 16 | lm loss: 8.074065E+00 | loss scale: 4096.0 | grad norm: 45799.989 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 261/ 159576 | consumed samples: 4176 | elapsed time per iteration (ms): 13604.0 | learning rate: 1.158E-06 | global batch size: 16 | lm loss: 8.134088E+00 | loss scale: 4096.0 | grad norm: 34426.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 262/ 159576 | consumed samples: 4192 | elapsed time per iteration (ms): 13632.5 | learning rate: 1.163E-06 | global batch size: 16 | lm loss: 8.331153E+00 | loss scale: 4096.0 | grad norm: 241742.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 263/ 159576 | consumed samples: 4208 | elapsed time per iteration (ms): 14049.0 | learning rate: 1.167E-06 | global batch size: 16 | lm loss: 8.300336E+00 | loss scale: 4096.0 | grad norm: 89382.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 264/ 159576 | consumed samples: 4224 | elapsed time per iteration (ms): 13554.0 | learning rate: 1.172E-06 | global batch size: 16 | lm loss: 8.285131E+00 | loss scale: 4096.0 | grad norm: 56471.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 265/ 159576 | consumed samples: 4240 | elapsed time per iteration (ms): 13594.4 | learning rate: 1.176E-06 | global batch size: 16 | lm loss: 8.247953E+00 | loss scale: 4096.0 | grad norm: 59934.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 266/ 159576 | consumed samples: 4256 | elapsed time per iteration (ms): 13722.5 | learning rate: 1.180E-06 | global batch size: 16 | lm loss: 8.086367E+00 | loss scale: 4096.0 | grad norm: 49794.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 267/ 159576 | consumed samples: 4272 | elapsed time per iteration (ms): 13925.6 | learning rate: 1.185E-06 | global batch size: 16 | lm loss: 8.364625E+00 | loss scale: 4096.0 | grad norm: 198667.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 268/ 159576 | consumed samples: 4288 | elapsed time per iteration (ms): 13685.9 | learning rate: 1.189E-06 | global batch size: 16 | lm loss: 8.378025E+00 | loss scale: 4096.0 | grad norm: 206726.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 269/ 159576 | consumed samples: 4304 | elapsed time per iteration (ms): 13784.2 | learning rate: 1.194E-06 | global batch size: 16 | lm loss: 8.309950E+00 | loss scale: 4096.0 | grad norm: 102692.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 270/ 159576 | consumed samples: 4320 | elapsed time per iteration (ms): 13426.6 | learning rate: 1.198E-06 | global batch size: 16 | lm loss: 8.437682E+00 | loss scale: 4096.0 | grad norm: 53779.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 271/ 159576 | consumed samples: 4336 | elapsed time per iteration (ms): 13590.5 | learning rate: 
1.203E-06 | global batch size: 16 | lm loss: 8.180303E+00 | loss scale: 4096.0 | grad norm: 41837.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 272/ 159576 | consumed samples: 4352 | elapsed time per iteration (ms): 13918.1 | learning rate: 1.207E-06 | global batch size: 16 | lm loss: 8.269817E+00 | loss scale: 4096.0 | grad norm: 60250.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 273/ 159576 | consumed samples: 4368 | elapsed time per iteration (ms): 13764.9 | learning rate: 1.212E-06 | global batch size: 16 | lm loss: 8.196259E+00 | loss scale: 4096.0 | grad norm: 51310.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 274/ 159576 | consumed samples: 4384 | elapsed time per iteration (ms): 13543.7 | learning rate: 1.216E-06 | global batch size: 16 | lm loss: 8.111527E+00 | loss scale: 4096.0 | grad norm: 62869.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 275/ 159576 | consumed samples: 4400 | elapsed time per iteration (ms): 13741.6 | learning rate: 1.220E-06 | global batch size: 16 | lm loss: 8.196915E+00 | loss scale: 4096.0 | grad norm: 56382.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 276/ 159576 | consumed samples: 4416 | elapsed time per iteration (ms): 14418.6 | learning rate: 1.225E-06 | global batch size: 16 | lm loss: 8.163618E+00 | loss scale: 4096.0 | grad norm: 59897.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 277/ 159576 | consumed samples: 4432 | elapsed time per iteration (ms): 13488.6 | learning rate: 1.229E-06 | global batch size: 16 | lm loss: 8.232466E+00 | loss scale: 4096.0 | grad norm: 106883.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 278/ 159576 | consumed samples: 4448 | elapsed time per iteration (ms): 13680.7 | learning rate: 1.234E-06 | global batch size: 16 | lm loss: 8.285415E+00 | loss scale: 4096.0 | grad norm: 52155.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 279/ 159576 | consumed samples: 4464 | elapsed time per iteration (ms): 13663.3 | learning rate: 1.238E-06 | global batch size: 16 | lm loss: 8.221471E+00 | loss scale: 4096.0 | grad norm: 43151.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 280/ 159576 | consumed samples: 4480 | elapsed time per iteration (ms): 13783.3 | learning rate: 1.243E-06 | global batch size: 16 | lm loss: 7.827011E+00 | loss scale: 4096.0 | grad norm: 60081.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 281/ 159576 | consumed samples: 4496 | elapsed time per iteration (ms): 13993.1 | learning rate: 1.247E-06 | global batch size: 16 | lm loss: 8.016405E+00 | loss scale: 4096.0 | grad norm: 60969.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 282/ 159576 | consumed samples: 4512 | elapsed time per iteration (ms): 13747.2 | learning rate: 1.251E-06 | global batch size: 16 | lm loss: 8.205744E+00 | loss scale: 4096.0 | grad norm: 64657.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - 
iteration 283/ 159576 | consumed samples: 4528 | elapsed time per iteration (ms): 13732.1 | learning rate: 1.256E-06 | global batch size: 16 | lm loss: 8.225381E+00 | loss scale: 4096.0 | grad norm: 46007.720 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 284/ 159576 | consumed samples: 4544 | elapsed time per iteration (ms): 13701.8 | learning rate: 1.260E-06 | global batch size: 16 | lm loss: 8.069484E+00 | loss scale: 4096.0 | grad norm: 50539.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 285/ 159576 | consumed samples: 4560 | elapsed time per iteration (ms): 13774.1 | learning rate: 1.265E-06 | global batch size: 16 | lm loss: 8.313256E+00 | loss scale: 4096.0 | grad norm: 75301.930 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 286/ 159576 | consumed samples: 4576 | elapsed time per iteration (ms): 13700.1 | learning rate: 1.269E-06 | global batch size: 16 | lm loss: 8.296308E+00 | loss scale: 4096.0 | grad norm: 109402.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 287/ 159576 | consumed samples: 4592 | elapsed time per iteration (ms): 13678.1 | learning rate: 1.274E-06 | global batch size: 16 | lm loss: 8.245502E+00 | loss scale: 4096.0 | grad norm: 53639.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 288/ 159576 | consumed samples: 4608 | elapsed time per iteration (ms): 13698.6 | learning rate: 1.278E-06 | global batch size: 16 | lm loss: 8.137961E+00 | loss scale: 4096.0 | grad norm: 42750.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 289/ 159576 | consumed samples: 4624 | elapsed time per iteration (ms): 14172.7 | learning rate: 1.283E-06 | global batch size: 16 | lm loss: 8.187901E+00 | loss scale: 4096.0 | grad norm: 108265.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 290/ 159576 | consumed samples: 4640 | elapsed time per iteration (ms): 13663.7 | learning rate: 1.287E-06 | global batch size: 16 | lm loss: 8.092007E+00 | loss scale: 4096.0 | grad norm: 61613.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 291/ 159576 | consumed samples: 4656 | elapsed time per iteration (ms): 13802.2 | learning rate: 1.291E-06 | global batch size: 16 | lm loss: 8.140871E+00 | loss scale: 4096.0 | grad norm: 73138.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 292/ 159576 | consumed samples: 4672 | elapsed time per iteration (ms): 13588.8 | learning rate: 1.296E-06 | global batch size: 16 | lm loss: 8.096482E+00 | loss scale: 4096.0 | grad norm: 56947.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 293/ 159576 | consumed samples: 4688 | elapsed time per iteration (ms): 13692.3 | learning rate: 1.300E-06 | global batch size: 16 | lm loss: 8.261303E+00 | loss scale: 4096.0 | grad norm: 50306.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 294/ 159576 | consumed samples: 4704 | elapsed time per iteration (ms): 13953.1 | learning rate: 1.305E-06 | global batch size: 16 | lm loss: 8.088846E+00 | loss scale: 4096.0 | grad 
norm: 70651.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 295/ 159576 | consumed samples: 4720 | elapsed time per iteration (ms): 13681.7 | learning rate: 1.309E-06 | global batch size: 16 | lm loss: 8.216883E+00 | loss scale: 4096.0 | grad norm: 109748.850 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 296/ 159576 | consumed samples: 4736 | elapsed time per iteration (ms): 13680.1 | learning rate: 1.314E-06 | global batch size: 16 | lm loss: 8.011025E+00 | loss scale: 4096.0 | grad norm: 57863.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 297/ 159576 | consumed samples: 4752 | elapsed time per iteration (ms): 13766.7 | learning rate: 1.318E-06 | global batch size: 16 | lm loss: 8.023094E+00 | loss scale: 4096.0 | grad norm: 39732.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 298/ 159576 | consumed samples: 4768 | elapsed time per iteration (ms): 14056.0 | learning rate: 1.322E-06 | global batch size: 16 | lm loss: 8.085699E+00 | loss scale: 4096.0 | grad norm: 93534.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 299/ 159576 | consumed samples: 4784 | elapsed time per iteration (ms): 13507.1 | learning rate: 1.327E-06 | global batch size: 16 | lm loss: 8.410425E+00 | loss scale: 4096.0 | grad norm: 42550.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 300/ 159576 | consumed samples: 4800 | elapsed time per iteration (ms): 13670.9 | learning rate: 1.331E-06 | global batch size: 16 | lm loss: 8.125405E+00 | loss scale: 4096.0 | grad norm: 37244.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 301/ 159576 | consumed samples: 4816 | elapsed time per iteration (ms): 13643.0 | learning rate: 1.336E-06 | global batch size: 16 | lm loss: 7.945562E+00 | loss scale: 4096.0 | grad norm: 37921.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 302/ 159576 | consumed samples: 4832 | elapsed time per iteration (ms): 14097.2 | learning rate: 1.340E-06 | global batch size: 16 | lm loss: 8.073545E+00 | loss scale: 4096.0 | grad norm: 80879.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 303/ 159576 | consumed samples: 4848 | elapsed time per iteration (ms): 13625.2 | learning rate: 1.345E-06 | global batch size: 16 | lm loss: 8.224352E+00 | loss scale: 4096.0 | grad norm: 75920.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 304/ 159576 | consumed samples: 4864 | elapsed time per iteration (ms): 13709.0 | learning rate: 1.349E-06 | global batch size: 16 | lm loss: 8.025059E+00 | loss scale: 4096.0 | grad norm: 39535.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 305/ 159576 | consumed samples: 4880 | elapsed time per iteration (ms): 13741.5 | learning rate: 1.354E-06 | global batch size: 16 | lm loss: 8.094482E+00 | loss scale: 4096.0 | grad norm: 40630.922 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 306/ 159576 | consumed samples: 4896 | elapsed time per iteration (ms): 
13523.7 | learning rate: 1.358E-06 | global batch size: 16 | lm loss: 8.135887E+00 | loss scale: 4096.0 | grad norm: 80825.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 307/ 159576 | consumed samples: 4912 | elapsed time per iteration (ms): 14093.4 | learning rate: 1.362E-06 | global batch size: 16 | lm loss: 8.292034E+00 | loss scale: 4096.0 | grad norm: 86171.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 308/ 159576 | consumed samples: 4928 | elapsed time per iteration (ms): 13647.9 | learning rate: 1.367E-06 | global batch size: 16 | lm loss: 8.204563E+00 | loss scale: 4096.0 | grad norm: 46698.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 309/ 159576 | consumed samples: 4944 | elapsed time per iteration (ms): 13637.2 | learning rate: 1.371E-06 | global batch size: 16 | lm loss: 8.033182E+00 | loss scale: 4096.0 | grad norm: 42089.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 310/ 159576 | consumed samples: 4960 | elapsed time per iteration (ms): 13700.6 | learning rate: 1.376E-06 | global batch size: 16 | lm loss: 8.048797E+00 | loss scale: 4096.0 | grad norm: 56022.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 311/ 159576 | consumed samples: 4976 | elapsed time per iteration (ms): 14085.5 | learning rate: 1.380E-06 | global batch size: 16 | lm loss: 7.623003E+00 | loss scale: 4096.0 | grad norm: 72171.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 312/ 159576 | consumed samples: 4992 | elapsed time per iteration (ms): 13830.9 | learning rate: 1.385E-06 | global batch size: 16 | lm loss: 8.082812E+00 | loss scale: 4096.0 | grad norm: 39681.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 313/ 159576 | consumed samples: 5008 | elapsed time per iteration (ms): 13533.9 | learning rate: 1.389E-06 | global batch size: 16 | lm loss: 8.116117E+00 | loss scale: 4096.0 | grad norm: 33726.889 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 314/ 159576 | consumed samples: 5024 | elapsed time per iteration (ms): 13637.3 | learning rate: 1.393E-06 | global batch size: 16 | lm loss: 8.210217E+00 | loss scale: 4096.0 | grad norm: 89402.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 315/ 159576 | consumed samples: 5040 | elapsed time per iteration (ms): 14136.6 | learning rate: 1.398E-06 | global batch size: 16 | lm loss: 7.798199E+00 | loss scale: 4096.0 | grad norm: 83566.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 316/ 159576 | consumed samples: 5056 | elapsed time per iteration (ms): 13651.3 | learning rate: 1.402E-06 | global batch size: 16 | lm loss: 8.066372E+00 | loss scale: 4096.0 | grad norm: 38768.697 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 317/ 159576 | consumed samples: 5072 | elapsed time per iteration (ms): 13641.7 | learning rate: 1.407E-06 | global batch size: 16 | lm loss: 7.876265E+00 | loss scale: 4096.0 | grad norm: 36174.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan 
iterations: 0 | -time (ms) - iteration 318/ 159576 | consumed samples: 5088 | elapsed time per iteration (ms): 13653.8 | learning rate: 1.411E-06 | global batch size: 16 | lm loss: 7.979768E+00 | loss scale: 4096.0 | grad norm: 66651.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 319/ 159576 | consumed samples: 5104 | elapsed time per iteration (ms): 13755.9 | learning rate: 1.416E-06 | global batch size: 16 | lm loss: 8.094232E+00 | loss scale: 4096.0 | grad norm: 79088.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 320/ 159576 | consumed samples: 5120 | elapsed time per iteration (ms): 13900.8 | learning rate: 1.420E-06 | global batch size: 16 | lm loss: 8.113304E+00 | loss scale: 4096.0 | grad norm: 52331.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 321/ 159576 | consumed samples: 5136 | elapsed time per iteration (ms): 13649.9 | learning rate: 1.425E-06 | global batch size: 16 | lm loss: 8.128990E+00 | loss scale: 4096.0 | grad norm: 46927.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 322/ 159576 | consumed samples: 5152 | elapsed time per iteration (ms): 13693.6 | learning rate: 1.429E-06 | global batch size: 16 | lm loss: 8.486778E+00 | loss scale: 4096.0 | grad norm: 89462.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 323/ 159576 | consumed samples: 5168 | elapsed time per iteration (ms): 13699.8 | learning rate: 1.433E-06 | global batch size: 16 | lm loss: 8.051263E+00 | loss scale: 4096.0 | grad norm: 42680.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 324/ 159576 | consumed samples: 5184 | elapsed time per iteration (ms): 14041.8 | learning rate: 1.438E-06 | global batch size: 16 | lm loss: 8.181097E+00 | loss scale: 4096.0 | grad norm: 43801.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 325/ 159576 | consumed samples: 5200 | elapsed time per iteration (ms): 13711.0 | learning rate: 1.442E-06 | global batch size: 16 | lm loss: 8.171723E+00 | loss scale: 4096.0 | grad norm: 47748.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 326/ 159576 | consumed samples: 5216 | elapsed time per iteration (ms): 13743.3 | learning rate: 1.447E-06 | global batch size: 16 | lm loss: 8.035454E+00 | loss scale: 4096.0 | grad norm: 58353.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 327/ 159576 | consumed samples: 5232 | elapsed time per iteration (ms): 13602.7 | learning rate: 1.451E-06 | global batch size: 16 | lm loss: 8.021453E+00 | loss scale: 4096.0 | grad norm: 44165.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 328/ 159576 | consumed samples: 5248 | elapsed time per iteration (ms): 13748.9 | learning rate: 1.456E-06 | global batch size: 16 | lm loss: 8.051726E+00 | loss scale: 4096.0 | grad norm: 35138.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | -time (ms) - iteration 329/ 159576 | consumed samples: 5264 | elapsed time per iteration (ms): 13961.7 | learning rate: 1.460E-06 | global batch size: 16 | lm loss: 7.960547E+00 | 
loss scale: 4096.0 | grad norm: 41197.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 330/ 159576 | consumed samples: 5280 | elapsed time per iteration (ms): 13633.4 | learning rate: 1.464E-06 | global batch size: 16 | lm loss: 8.084079E+00 | loss scale: 4096.0 | grad norm: 43199.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 331/ 159576 | consumed samples: 5296 | elapsed time per iteration (ms): 13678.9 | learning rate: 1.469E-06 | global batch size: 16 | lm loss: 8.243130E+00 | loss scale: 4096.0 | grad norm: 39935.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 332/ 159576 | consumed samples: 5312 | elapsed time per iteration (ms): 13653.3 | learning rate: 1.473E-06 | global batch size: 16 | lm loss: 8.148146E+00 | loss scale: 4096.0 | grad norm: 31710.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 333/ 159576 | consumed samples: 5328 | elapsed time per iteration (ms): 13982.9 | learning rate: 1.478E-06 | global batch size: 16 | lm loss: 8.055049E+00 | loss scale: 4096.0 | grad norm: 40555.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 334/ 159576 | consumed samples: 5344 | elapsed time per iteration (ms): 13576.5 | learning rate: 1.482E-06 | global batch size: 16 | lm loss: 8.154724E+00 | loss scale: 4096.0 | grad norm: 98189.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 335/ 159576 | consumed samples: 5360 | elapsed time per iteration (ms): 13666.3 | learning rate: 1.487E-06 | global batch size: 16 | lm loss: 8.056485E+00 | loss scale: 4096.0 | grad norm: 53277.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 336/ 159576 | consumed samples: 5376 | elapsed time per iteration (ms): 13667.7 | learning rate: 1.491E-06 | global batch size: 16 | lm loss: 7.902112E+00 | loss scale: 4096.0 | grad norm: 35520.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 337/ 159576 | consumed samples: 5392 | elapsed time per iteration (ms): 14189.1 | learning rate: 1.496E-06 | global batch size: 16 | lm loss: 8.211933E+00 | loss scale: 4096.0 | grad norm: 102636.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 338/ 159576 | consumed samples: 5408 | elapsed time per iteration (ms): 13538.3 | learning rate: 1.500E-06 | global batch size: 16 | lm loss: 8.077993E+00 | loss scale: 4096.0 | grad norm: 74161.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 339/ 159576 | consumed samples: 5424 | elapsed time per iteration (ms): 13690.1 | learning rate: 1.504E-06 | global batch size: 16 | lm loss: 8.002722E+00 | loss scale: 4096.0 | grad norm: 41178.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 340/ 159576 | consumed samples: 5440 | elapsed time per iteration (ms): 13761.4 | learning rate: 1.509E-06 | global batch size: 16 | lm loss: 8.070647E+00 | loss scale: 4096.0 | grad norm: 146660.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 341/ 159576 | consumed samples: 5456 | elapsed time per iteration (ms): 13679.6 | learning rate: 1.513E-06 | global batch size: 16 | lm loss: 8.211810E+00 | loss scale: 4096.0 | grad norm: 56011.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 342/ 159576 | consumed samples: 5472 | elapsed time per iteration (ms): 13958.7 | learning rate: 1.518E-06 | global batch size: 16 | lm loss: 8.028828E+00 | loss scale: 4096.0 | grad norm: 45507.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 343/ 159576 | consumed samples: 5488 | elapsed time per iteration (ms): 13796.1 | learning rate: 1.522E-06 | global batch size: 16 | lm loss: 8.000618E+00 | loss scale: 4096.0 | grad norm: 41366.016 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 344/ 159576 | consumed samples: 5504 | elapsed time per iteration (ms): 13566.5 | learning rate: 1.527E-06 | global batch size: 16 | lm loss: 8.106353E+00 | loss scale: 4096.0 | grad norm: 86487.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 345/ 159576 | consumed samples: 5520 | elapsed time per iteration (ms): 13617.7 | learning rate: 1.531E-06 | global batch size: 16 | lm loss: 8.130958E+00 | loss scale: 4096.0 | grad norm: 65559.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 346/ 159576 | consumed samples: 5536 | elapsed time per iteration (ms): 14006.3 | learning rate: 1.536E-06 | global batch size: 16 | lm loss: 8.100373E+00 | loss scale: 4096.0 | grad norm: 50918.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 347/ 159576 | consumed samples: 5552 | elapsed time per iteration (ms): 13652.0 | learning rate: 1.540E-06 | global batch size: 16 | lm loss: 8.193462E+00 | loss scale: 4096.0 | grad norm: 49482.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 348/ 159576 | consumed samples: 5568 | elapsed time per iteration (ms): 13785.4 | learning rate: 1.544E-06 | global batch size: 16 | lm loss: 8.185720E+00 | loss scale: 4096.0 | grad norm: 33616.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 349/ 159576 | consumed samples: 5584 | elapsed time per iteration (ms): 13534.7 | learning rate: 1.549E-06 | global batch size: 16 | lm loss: 7.997324E+00 | loss scale: 4096.0 | grad norm: 41224.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 350/ 159576 | consumed samples: 5600 | elapsed time per iteration (ms): 14148.0 | learning rate: 1.553E-06 | global batch size: 16 | lm loss: 8.069170E+00 | loss scale: 4096.0 | grad norm: 61139.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 351/ 159576 | consumed samples: 5616 | elapsed time per iteration (ms): 13626.0 | learning rate: 1.558E-06 | global batch size: 16 | lm loss: 8.052499E+00 | loss scale: 4096.0 | grad norm: 58965.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 352/ 159576 | consumed samples: 5632 | elapsed time per iteration (ms): 13633.5 | learning rate: 1.562E-06 | global batch size: 16 | lm loss: 8.036291E+00 | loss scale: 4096.0 | grad norm: 38820.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 353/ 159576 | consumed samples: 5648 | elapsed time per iteration (ms): 13648.6 | learning rate: 1.567E-06 | global batch size: 16 | lm loss: 8.007360E+00 | loss scale: 4096.0 | grad norm: 33342.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 354/ 159576 | consumed samples: 5664 | elapsed time per iteration (ms): 13707.0 | learning rate: 1.571E-06 | global batch size: 16 | lm loss: 7.890161E+00 | loss scale: 4096.0 | grad norm: 62589.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 355/ 159576 | consumed samples: 5680 | elapsed time per iteration (ms): 14101.4 | learning rate: 1.575E-06 | global batch size: 16 | lm loss: 8.034273E+00 | loss scale: 4096.0 | grad norm: 62100.784 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 356/ 159576 | consumed samples: 5696 | elapsed time per iteration (ms): 13548.4 | learning rate: 1.580E-06 | global batch size: 16 | lm loss: 7.964279E+00 | loss scale: 4096.0 | grad norm: 37283.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 357/ 159576 | consumed samples: 5712 | elapsed time per iteration (ms): 13655.3 | learning rate: 1.584E-06 | global batch size: 16 | lm loss: 7.882459E+00 | loss scale: 4096.0 | grad norm: 36278.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 358/ 159576 | consumed samples: 5728 | elapsed time per iteration (ms): 13872.1 | learning rate: 1.589E-06 | global batch size: 16 | lm loss: 8.081428E+00 | loss scale: 4096.0 | grad norm: 59624.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 359/ 159576 | consumed samples: 5744 | elapsed time per iteration (ms): 13830.3 | learning rate: 1.593E-06 | global batch size: 16 | lm loss: 8.345490E+00 | loss scale: 4096.0 | grad norm: 101818.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 360/ 159576 | consumed samples: 5760 | elapsed time per iteration (ms): 13738.3 | learning rate: 1.598E-06 | global batch size: 16 | lm loss: 8.090802E+00 | loss scale: 4096.0 | grad norm: 37735.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 361/ 159576 | consumed samples: 5776 | elapsed time per iteration (ms): 13673.7 | learning rate: 1.602E-06 | global batch size: 16 | lm loss: 7.934822E+00 | loss scale: 4096.0 | grad norm: 35051.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 362/ 159576 | consumed samples: 5792 | elapsed time per iteration (ms): 13779.0 | learning rate: 1.607E-06 | global batch size: 16 | lm loss: 8.217977E+00 | loss scale: 4096.0 | grad norm: 81671.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 363/ 159576 | consumed samples: 5808 | elapsed time per iteration (ms): 14148.6 | learning rate: 1.611E-06 | global batch size: 16 | lm loss: 7.956856E+00 | loss scale: 4096.0 | grad norm: 123728.069 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 364/ 159576 | consumed samples: 5824 | elapsed time per iteration (ms): 13509.6 | learning rate: 1.615E-06 | global batch size: 16 | lm loss: 7.980748E+00 | loss scale: 4096.0 | grad norm: 64323.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 365/ 159576 | consumed samples: 5840 | elapsed time per iteration (ms): 13791.1 | learning rate: 1.620E-06 | global batch size: 16 | lm loss: 7.927495E+00 | loss scale: 4096.0 | grad norm: 38595.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 366/ 159576 | consumed samples: 5856 | elapsed time per iteration (ms): 13535.8 | learning rate: 1.624E-06 | global batch size: 16 | lm loss: 7.992770E+00 | loss scale: 4096.0 | grad norm: 34786.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 367/ 159576 | consumed samples: 5872 | elapsed time per iteration (ms): 13709.6 | learning rate: 1.629E-06 | global batch size: 16 | lm loss: 8.033854E+00 | loss scale: 4096.0 | grad norm: 26681.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 368/ 159576 | consumed samples: 5888 | elapsed time per iteration (ms): 13923.8 | learning rate: 1.633E-06 | global batch size: 16 | lm loss: 8.086361E+00 | loss scale: 4096.0 | grad norm: 116063.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 369/ 159576 | consumed samples: 5904 | elapsed time per iteration (ms): 13743.2 | learning rate: 1.638E-06 | global batch size: 16 | lm loss: 8.136069E+00 | loss scale: 4096.0 | grad norm: 192843.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 370/ 159576 | consumed samples: 5920 | elapsed time per iteration (ms): 13586.5 | learning rate: 1.642E-06 | global batch size: 16 | lm loss: 8.213842E+00 | loss scale: 4096.0 | grad norm: 66749.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 371/ 159576 | consumed samples: 5936 | elapsed time per iteration (ms): 13637.5 | learning rate: 1.646E-06 | global batch size: 16 | lm loss: 7.862526E+00 | loss scale: 4096.0 | grad norm: 35628.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 372/ 159576 | consumed samples: 5952 | elapsed time per iteration (ms): 14269.3 | learning rate: 1.651E-06 | global batch size: 16 | lm loss: 8.111351E+00 | loss scale: 4096.0 | grad norm: 51284.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 373/ 159576 | consumed samples: 5968 | elapsed time per iteration (ms): 13424.8 | learning rate: 1.655E-06 | global batch size: 16 | lm loss: 7.860275E+00 | loss scale: 4096.0 | grad norm: 51885.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 374/ 159576 | consumed samples: 5984 | elapsed time per iteration (ms): 13638.9 | learning rate: 1.660E-06 | global batch size: 16 | lm loss: 7.995843E+00 | loss scale: 4096.0 | grad norm: 40982.716 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 375/ 159576 | consumed samples: 6000 | elapsed time per iteration (ms): 13719.8 | learning rate: 1.664E-06 | global batch size: 16 | lm loss: 7.989121E+00 | loss scale: 4096.0 | grad norm: 43694.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 376/ 159576 | consumed samples: 6016 | elapsed time per iteration (ms): 13718.2 | learning rate: 1.669E-06 | global batch size: 16 | lm loss: 8.054690E+00 | loss scale: 4096.0 | grad norm: 56142.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 377/ 159576 | consumed samples: 6032 | elapsed time per iteration (ms): 14087.0 | learning rate: 1.673E-06 | global batch size: 16 | lm loss: 8.145277E+00 | loss scale: 4096.0 | grad norm: 77837.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 378/ 159576 | consumed samples: 6048 | elapsed time per iteration (ms): 13621.7 | learning rate: 1.678E-06 | global batch size: 16 | lm loss: 7.879861E+00 | loss scale: 4096.0 | grad norm: 35054.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 379/ 159576 | consumed samples: 6064 | elapsed time per iteration (ms): 13676.7 | learning rate: 1.682E-06 | global batch size: 16 | lm loss: 7.996103E+00 | loss scale: 4096.0 | grad norm: 31871.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 380/ 159576 | consumed samples: 6080 | elapsed time per iteration (ms): 13756.2 | learning rate: 1.686E-06 | global batch size: 16 | lm loss: 7.788074E+00 | loss scale: 4096.0 | grad norm: 30378.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 381/ 159576 | consumed samples: 6096 | elapsed time per iteration (ms): 13731.7 | learning rate: 1.691E-06 | global batch size: 16 | lm loss: 7.998044E+00 | loss scale: 4096.0 | grad norm: 78167.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 382/ 159576 | consumed samples: 6112 | elapsed time per iteration (ms): 13696.8 | learning rate: 1.695E-06 | global batch size: 16 | lm loss: 8.001510E+00 | loss scale: 4096.0 | grad norm: 57981.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 383/ 159576 | consumed samples: 6128 | elapsed time per iteration (ms): 13688.0 | learning rate: 1.700E-06 | global batch size: 16 | lm loss: 8.043833E+00 | loss scale: 4096.0 | grad norm: 40631.885 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 384/ 159576 | consumed samples: 6144 | elapsed time per iteration (ms): 13680.4 | learning rate: 1.704E-06 | global batch size: 16 | lm loss: 8.029270E+00 | loss scale: 4096.0 | grad norm: 31579.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 385/ 159576 | consumed samples: 6160 | elapsed time per iteration (ms): 14057.5 | learning rate: 1.709E-06 | global batch size: 16 | lm loss: 8.156369E+00 | loss scale: 4096.0 | grad norm: 87842.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 386/ 159576 | consumed samples: 6176 | elapsed time per iteration (ms): 13765.1 | learning rate: 1.713E-06 | global batch size: 16 | lm loss: 8.024692E+00 | loss scale: 4096.0 | grad norm: 56881.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 387/ 159576 | consumed samples: 6192 | elapsed time per iteration (ms): 13768.8 | learning rate: 1.717E-06 | global batch size: 16 | lm loss: 7.997876E+00 | loss scale: 4096.0 | grad norm: 31105.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 388/ 159576 | consumed samples: 6208 | elapsed time per iteration (ms): 13433.5 | learning rate: 1.722E-06 | global batch size: 16 | lm loss: 7.985063E+00 | loss scale: 4096.0 | grad norm: 78090.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 389/ 159576 | consumed samples: 6224 | elapsed time per iteration (ms): 13675.2 | learning rate: 1.726E-06 | global batch size: 16 | lm loss: 7.926050E+00 | loss scale: 4096.0 | grad norm: 61534.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 390/ 159576 | consumed samples: 6240 | elapsed time per iteration (ms): 13989.4 | learning rate: 1.731E-06 | global batch size: 16 | lm loss: 7.938218E+00 | loss scale: 4096.0 | grad norm: 37749.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 391/ 159576 | consumed samples: 6256 | elapsed time per iteration (ms): 13663.4 | learning rate: 1.735E-06 | global batch size: 16 | lm loss: 7.835842E+00 | loss scale: 4096.0 | grad norm: 48700.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 392/ 159576 | consumed samples: 6272 | elapsed time per iteration (ms): 13682.5 | learning rate: 1.740E-06 | global batch size: 16 | lm loss: 7.976984E+00 | loss scale: 4096.0 | grad norm: 45273.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 393/ 159576 | consumed samples: 6288 | elapsed time per iteration (ms): 13680.3 | learning rate: 1.744E-06 | global batch size: 16 | lm loss: 8.063533E+00 | loss scale: 4096.0 | grad norm: 62966.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 394/ 159576 | consumed samples: 6304 | elapsed time per iteration (ms): 14158.6 | learning rate: 1.749E-06 | global batch size: 16 | lm loss: 7.962408E+00 | loss scale: 4096.0 | grad norm: 38917.941 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 395/ 159576 | consumed samples: 6320 | elapsed time per iteration (ms): 13412.3 | learning rate: 1.753E-06 | global batch size: 16 | lm loss: 7.930057E+00 | loss scale: 4096.0 | grad norm: 59046.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 396/ 159576 | consumed samples: 6336 | elapsed time per iteration (ms): 13631.9 | learning rate: 1.757E-06 | global batch size: 16 | lm loss: 8.137497E+00 | loss scale: 4096.0 | grad norm: 51299.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 397/ 159576 | consumed samples: 6352 | elapsed time per iteration (ms): 13706.0 | learning rate: 1.762E-06 | global batch size: 16 | lm loss: 8.020626E+00 | loss scale: 4096.0 | grad norm: 37056.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 398/ 159576 | consumed samples: 6368 | elapsed time per iteration (ms): 14158.0 | learning rate: 1.766E-06 | global batch size: 16 | lm loss: 8.114269E+00 | loss scale: 4096.0 | grad norm: 64105.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 399/ 159576 | consumed samples: 6384 | elapsed time per iteration (ms): 13628.9 | learning rate: 1.771E-06 | global batch size: 16 | lm loss: 8.186448E+00 | loss scale: 4096.0 | grad norm: 55633.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
- iteration 400/ 159576 | consumed samples: 6400 | elapsed time per iteration (ms): 13727.5 | learning rate: 1.775E-06 | global batch size: 16 | lm loss: 8.182411E+00 | loss scale: 4096.0 | grad norm: 51312.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
-time (ms)
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ed4d841c218edb70af5e71866909ccaf9c42c1c5f23c614f76174564e0709c5
+size 67163603