diff --git "a/log_node31.txt" "b/log_node31.txt" new file mode 100644--- /dev/null +++ "b/log_node31.txt" @@ -0,0 +1,29471 @@ ++ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node31.txt +Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node31.txt ++ export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/31 ++ ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/31 ++ mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/31 ++ DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml ++ TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/ ++ CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ ++ VIT_CKPT_LOAD_DIR=/ ++ CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// ++ rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/ +sending incremental file list + +sent 71 bytes received 12 bytes 166.00 bytes/sec +total size is 23.84K speedup is 287.17 ++ cd /local_disk/cognitron_vl/ ++ rm -fr datasets ++ mkdir -p datasets ++ ln -s /data/data/ datasets/CV ++ ln -s /data/data/LLM datasets/LLM ++ ln -s /data/data/LMM datasets/LMM ++ source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh +++ source /usr/local/Ascend/driver/bin/setenv.bash ++++ DEP_INFO_FILE=/etc/ascend_install.info ++++ [[ -f /etc/ascend_install.info ]] ++++ . /etc/ascend_install.info ++++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common ++++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver ++++ DRV_LIB64_LDPATH=/driver/lib64 ++++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin +++ source /usr/local/Ascend/ascend-toolkit/set_env.sh ++++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +++++ arch ++++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: ++++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++++ PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp ++++ ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp ++++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit ++++ TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit ++++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest ++++ ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest +++ export HCCL_CONNECT_TIMEOUT=7200 +++ HCCL_CONNECT_TIMEOUT=7200 +++ export HCCL_EXEC_TIMEOUT=7200 +++ HCCL_EXEC_TIMEOUT=7200 +++ export COMBINED_ENABLE=1 +++ COMBINED_ENABLE=1 +++ export MULTI_STREAM_MEMORY_REUSE=1 +++ MULTI_STREAM_MEMORY_REUSE=1 +++ export HCCL_RDMA_TC=160 +++ HCCL_RDMA_TC=160 +++ export HCCL_RDMA_SL=5 +++ HCCL_RDMA_SL=5 +++ export HCCL_INTRA_PCIE_ENABLE=0 +++ HCCL_INTRA_PCIE_ENABLE=0 +++ export HCCL_INTRA_ROCE_ENABLE=1 +++ HCCL_INTRA_ROCE_ENABLE=1 +++ export HCCL_RDMA_TIMEOUT=20 +++ HCCL_RDMA_TIMEOUT=20 +++ export INF_NAN_MODE_ENABLE=1 +++ INF_NAN_MODE_ENABLE=1 +++ export DISTRIBUTED_BACKEND=hccl +++ DISTRIBUTED_BACKEND=hccl +++ export ASCEND_LAUNCH_BLOCKING=0 +++ ASCEND_LAUNCH_BLOCKING=0 +++ export ASCEND_SLOG_PRINT_TO_STDOUT=0 +++ ASCEND_SLOG_PRINT_TO_STDOUT=0 +++ export ASCEND_GLOBAL_LOG_LEVEL=3 +++ ASCEND_GLOBAL_LOG_LEVEL=3 +++ export ASCEND_GLOBAL_EVENT_ENABLE=0 +++ ASCEND_GLOBAL_EVENT_ENABLE=0 +++ export TASK_QUEUE_ENABLE=1 +++ TASK_QUEUE_ENABLE=1 +++ export PTCOPY_ENABLE=1 +++ PTCOPY_ENABLE=1 +++ export COMBINED_ENABLE=1 +++ COMBINED_ENABLE=1 +++ export DYNAMIC_OP=ADD#MUL +++ DYNAMIC_OP=ADD#MUL +++ export HCCL_WHITELIST_DISABLE=1 +++ HCCL_WHITELIST_DISABLE=1 +++ export HCCL_CONNECT_TIMEOUT=7200 +++ HCCL_CONNECT_TIMEOUT=7200 +++ export HCCL_WHITELIST_DISABLE=1 +++ HCCL_WHITELIST_DISABLE=1 +++ export CUDA_DEVICE_MAX_CONNECTIONS=1 +++ CUDA_DEVICE_MAX_CONNECTIONS=1 +++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt +Looking in links: /data/software/ +Processing data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1)) +Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0) +Processing data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3)) +Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0) +Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0) +Processing data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6)) +Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5) +Processing data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8)) +Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0) +Processing data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10)) +Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2) +Processing data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12)) +Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1) +Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5) +Processing data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15)) +Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16) +Processing data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17)) +Processing data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18)) +Processing data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19)) +Processing data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20)) +Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4) +Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2) +Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8) +Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1) +Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu) +Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2) +Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2) +Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3) +Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25) +Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0) +Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1) +Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1) +Requirement already satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0) +Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5) +Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0) +Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1) +Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2) +Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2) +Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1) +Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7) +Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3) +Processing data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6)) +Processing data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1) +Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15) +Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0) +Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3) +Processing data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3) +Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1) +Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0) +Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1) +Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30) +Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (9.0.0) +Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15) +Processing data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2)) +Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0) +Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1) +Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3) +Processing data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) +Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7) +Processing data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) +Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1) +Processing data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18)) +Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0) +Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1) +Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1) +Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0) +Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1) +Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5) +Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4) +Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3) +Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0) +Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0) +Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5) +Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2) +Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6) +Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18) +Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2) +Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4) +Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1) +Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1) +Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0) +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets + Attempting uninstall: tqdm + Found existing installation: tqdm 4.66.2 + Uninstalling tqdm-4.66.2: + Successfully uninstalled tqdm-4.66.2 + Attempting uninstall: safetensors + Found existing installation: safetensors 0.4.2 + Uninstalling safetensors-0.4.2: + Successfully uninstalled safetensors-0.4.2 + Attempting uninstall: requests + Found existing installation: requests 2.31.0 + Uninstalling requests-2.31.0: + Successfully uninstalled requests-2.31.0 + Attempting uninstall: pyarrow + Found existing installation: pyarrow 15.0.0 + Uninstalling pyarrow-15.0.0: + Successfully uninstalled pyarrow-15.0.0 + Attempting uninstall: huggingface-hub + Found existing installation: huggingface-hub 0.20.3 + Uninstalling huggingface-hub-0.20.3: + Successfully uninstalled huggingface-hub-0.20.3 + Attempting uninstall: accelerate + Found existing installation: accelerate 0.25.0 + Uninstalling accelerate-0.25.0: + Successfully uninstalled accelerate-0.25.0 + Attempting uninstall: datasets + Found existing installation: datasets 2.16.0 + Uninstalling datasets-2.16.0: + Successfully uninstalled datasets-2.16.0 +ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. +tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed. +tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed. +tikit 1.8.2.240926 requires future==0.18.2, which is not installed. +tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed. +tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed. +tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed. +tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed. +tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed. +tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed. +tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed. +tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed. +tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed. +tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible. +tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible. +tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible. +tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible. +tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible. +tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible. +Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv +++ return 0 ++ MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ ++ MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ ++ MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/ ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0 + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Checking if build backend supports build_editable: started + Checking if build backend supports build_editable: finished with status 'done' + Getting requirements to build editable: started + Getting requirements to build editable: finished with status 'done' + Installing backend dependencies: started + Installing backend dependencies: finished with status 'done' + Preparing editable metadata (pyproject.toml): started + Preparing editable metadata (pyproject.toml): finished with status 'done' +Building wheels for collected packages: megatron_core + Building editable for megatron_core (pyproject.toml): started + Building editable for megatron_core (pyproject.toml): finished with status 'done' + Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=06d5bd071b6eadb2bc6965a495bd802172dae415af74dd60b1478328d6910bcd + Stored in directory: /tmp/pip-ephem-wheel-cache-wolh2e_3/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff +Successfully built megatron_core +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: megatron_core +Successfully installed megatron_core-0.6.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0 + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA' +WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA' +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: mindspeed + Running setup.py develop for mindspeed +Successfully installed mindspeed-0.6.0 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/ +Looking in links: /data/software/ +Obtaining file://local_disk/cognitron_vl/third_party/ModelLink + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4) +Processing data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1) +Processing data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1) + Preparing metadata (setup.py): started + Preparing metadata (setup.py): finished with status 'done' +Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4) +Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1) +Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1) +Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0) +Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) +Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0) +Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6) +Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2) +Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0) +Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3) +Processing data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1) +Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) +Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2) +Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8) +Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1) +Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu) +Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1) +Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5) +Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2) +Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1) +Requirement already satisfied: regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25) +Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3) +Processing data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1) +Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0) +Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7) +Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3) +Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1) +Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15) +Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0) +Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3) +Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0) +Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1) +Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0) +Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1) +Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5) +Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4) +Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3) +Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0) +Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2) +Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6) +Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18) +Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2) +Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1) +Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3) +Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2) +Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) +Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) +Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5) +Building wheels for collected packages: transformers_stream_generator + Building wheel for transformers_stream_generator (setup.py): started + Building wheel for transformers_stream_generator (setup.py): finished with status 'done' + Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=3ed62a866ab10917ceed94a0bafc0596380802f798ed67b7de78b76fe0b65f1f + Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c +Successfully built transformers_stream_generator +DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 +Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink + Attempting uninstall: tokenizers + Found existing installation: tokenizers 0.20.3 + Uninstalling tokenizers-0.20.3: + Successfully uninstalled tokenizers-0.20.3 + Attempting uninstall: transformers + Found existing installation: transformers 4.46.3 + Uninstalling transformers-4.46.3: + Successfully uninstalled transformers-4.46.3 + Attempting uninstall: peft + Found existing installation: peft 0.7.0 + Uninstalling peft-0.7.0: + Successfully uninstalled peft-0.7.0 + Running setup.py develop for modellink +Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5 +WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++ PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: ++ GPUS_PER_NODE=16 ++ NNODES=32 ++ NODE_RANK=31 ++ MASTER_PORT=34567 ++ export CUDA_DEVICE_MAX_CONNECTIONS=1 ++ CUDA_DEVICE_MAX_CONNECTIONS=1 ++ VISION_SEQ_LENGTH=1025 ++ IMAGE_TOKEN_LENGTH=256 ++ IMAGE_SIZE=448 ++ VISION_MODEL_TYPE=intern_300m ++ TP=8 ++ PP=1 ++ CP=2 ++ CP_ALGO=megatron_cp_algo ++ CP_MASK=causal ++ DISTRIBUTED_ARGS=' + --nproc_per_node 16 --nnodes 32 --node_rank 31 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 +' ++ GPT_ARGS=' + --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint ' ++ DATA_ARGS=' + --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 ' ++ CKPT_ARGS=' + --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// ' ++ OUTPUT_ARGS=' + --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 ' ++ torchrun --nproc_per_node 16 --nnodes 32 --node_rank 31 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl +[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] +[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] ***************************************** +[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] ***************************************** +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... + +Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp... +Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja... +Building extension module adaptive_cp... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... +[1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o +[2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +Loading extension module adaptive_cp... +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 + warnings.warn("failed to generate the npu_matmul_add_fp32") +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? + warn( +> compiling dataset index builder ... +make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' +make: Nothing to be done for 'default'. +make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' +>>> done with dataset index builder. Compilation time: 0.077 seconds +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute Falsevision_projector_recompute False + +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_projector_recompute False +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +vision_model_freeze=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +vision_model_freeze +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.vision_model_freeze +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + + + + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +vision_model_freeze +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +vision_model_freezemodel GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +)=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. + + +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +vision_model_freeze +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +vision_model_freezevision_model_freeze + +=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. + +=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. + +=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. + +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. +=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +model GPTVLModel( + (external_feature_model): MegatronVisionModel( + (vit): InternViTModel( + (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + (position_embeddings): Embedding(1025, 1024) + (decoder): TransformerBlock( + (layers): ModuleList( + (0-23): 24 x InternViTTransformerLayer( + (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + ) + (self_attn_bda): IdentityFuncOp() + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + (mlp_bda): IdentityFuncOp() + ) + ) + ) + ) + (vision_projection): MultimodalProjector( + (encoder): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + ) + (embedding): LanguageModelEmbedding( + (word_embeddings): VocabParallelEmbedding() + (embedding_dropout): Dropout(p=0.0, inplace=False) + ) + (rotary_pos_emb): RotaryEmbedding() + (decoder): TransformerBlock( + (layers): ModuleList( + (0-47): 48 x TransformerLayer( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (core_attention): DotProductAttention( + (scale_mask_softmax): FusedScaleMaskSoftmax() + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (linear_proj): RowParallelLinear() + (linear_qkv): ColumnParallelLinear() + (q_layernorm): IdentityOp() + (k_layernorm): IdentityOp() + ) + (pre_cross_attn_layernorm): IdentityOp() + (cross_attention): IdentityOp() + (cross_attn_bda): IdentityFuncOp() + (pre_mlp_layernorm): RMSNorm() + (mlp): MLP( + (linear_fc1): ColumnParallelLinear() + (linear_fc2): RowParallelLinear() + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): ColumnParallelLinear() +) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + + + +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) + + + + +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + + + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + + +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) + +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) +_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) +_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) +_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration _load_base_checkpoint iteration 5000 _load_base_checkpoint iteration 5000 50005000_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration +50005000 + 5000_load_base_checkpoint iteration50005000 + + +_load_base_checkpoint release_load_base_checkpoint release_load_base_checkpoint release_load_base_checkpoint release +False +_load_base_checkpoint release 50005000 + + + False _load_base_checkpoint release5000 5000 + +_load_base_checkpoint release5000_load_base_checkpoint release_load_base_checkpoint release 5000FalseFalse +False +False5000False_load_base_checkpoint release + _load_base_checkpoint release _load_base_checkpoint release + + + + +_load_base_checkpoint release + + + False FalseFalse_load_base_checkpoint release_load_base_checkpoint releaseFalse_load_base_checkpoint release +False +False + + False + +FalseFalse + + +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt +_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +load_checkpoint iteration 0 +load_checkpoint release False +strict True +(min, max) time across ranks (ms): + load-checkpoint ................................: (11120.76, 11123.74) +> rank 500 does not create GPT datasets ... +> rank 497 does not create GPT datasets ... +> rank 507 does not create GPT datasets ...> rank 499 does not create GPT datasets ... + +> rank 506 does not create GPT datasets ... +> rank 498 does not create GPT datasets ... +> rank 501 does not create GPT datasets ... +> rank 509 does not create GPT datasets ... +> rank 502 does not create GPT datasets ... +> rank 510 does not create GPT datasets ...> rank 505 does not create GPT datasets ... + +> rank 511 does not create GPT datasets ... +> rank 503 does not create GPT datasets ... +> rank 508 does not create GPT datasets ... +> rank 496 is creating GPT datasets ... +> rank 504 is creating GPT datasets ... +target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)] +possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]] +target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)] +possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]] +(min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (11717.92, 11729.42) + train/valid/test-data-iterators-setup ..........: (287040.22, 287455.78) +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +................................................................................................ [2024-11-27 12:54:56] iteration 1/ 1000 | consumed samples: 64 | elapsed time per iteration (ms): 279519.2 | throughput per GPU (TFLOP/s/GPU): 27.6 | learning rate: 1.666667E-07 | global batch size: 64 | lm loss: 1.334420E+00 | loss scale: 1.0 | grad norm: 7.828 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555def251b40] Missing reference picture, default is 65530 +[h264 @ 0x555def251b40] Missing reference picture, default is 65530 +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] Missing reference picture, default is 65530 +[h264 @ 0x555def251b40] Missing reference picture, default is 65530 +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530 +[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530 +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530 +[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530 +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] Missing reference picture, default is 65530 +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530 +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-27 12:57:02] iteration 2/ 1000 | consumed samples: 128 | elapsed time per iteration (ms): 126706.1 | throughput per GPU (TFLOP/s/GPU): 60.8 | learning rate: 3.333333E-07 | global batch size: 64 | lm loss: 9.522201E-01 | loss scale: 1.0 | grad norm: 2.493 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 12:58:57] iteration 3/ 1000 | consumed samples: 192 | elapsed time per iteration (ms): 114541.0 | throughput per GPU (TFLOP/s/GPU): 67.3 | learning rate: 5.000000E-07 | global batch size: 64 | lm loss: 9.801381E-01 | loss scale: 1.0 | grad norm: 86.020 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:00:25] iteration 4/ 1000 | consumed samples: 256 | elapsed time per iteration (ms): 88395.0 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 6.666667E-07 | global batch size: 64 | lm loss: 9.349928E-01 | loss scale: 1.0 | grad norm: 4.715 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 13:01:40] iteration 5/ 1000 | consumed samples: 320 | elapsed time per iteration (ms): 74663.0 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 8.333333E-07 | global batch size: 64 | lm loss: 8.840026E-01 | loss scale: 1.0 | grad norm: 3.307 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-27 13:02:58] iteration 6/ 1000 | consumed samples: 384 | elapsed time per iteration (ms): 78027.0 | throughput per GPU (TFLOP/s/GPU): 98.8 | learning rate: 1.000000E-06 | global batch size: 64 | lm loss: 1.074550E+00 | loss scale: 1.0 | grad norm: 8.121 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure + [2024-11-27 13:04:36] iteration 7/ 1000 | consumed samples: 448 | elapsed time per iteration (ms): 98050.4 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 1.166667E-06 | global batch size: 64 | lm loss: 1.004752E+00 | loss scale: 1.0 | grad norm: 5.474 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure + [2024-11-27 13:06:20] iteration 8/ 1000 | consumed samples: 512 | elapsed time per iteration (ms): 103726.0 | throughput per GPU (TFLOP/s/GPU): 74.3 | learning rate: 1.333333E-06 | global batch size: 64 | lm loss: 9.457669E-01 | loss scale: 1.0 | grad norm: 3.092 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure + [2024-11-27 13:07:53] iteration 9/ 1000 | consumed samples: 576 | elapsed time per iteration (ms): 93356.3 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 1.500000E-06 | global batch size: 64 | lm loss: 9.999593E-01 | loss scale: 1.0 | grad norm: 3.454 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-27 13:09:33] iteration 10/ 1000 | consumed samples: 640 | elapsed time per iteration (ms): 99857.5 | throughput per GPU (TFLOP/s/GPU): 77.2 | learning rate: 1.666667E-06 | global batch size: 64 | lm loss: 8.829347E-01 | loss scale: 1.0 | grad norm: 1.931 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 13:11:17] iteration 11/ 1000 | consumed samples: 704 | elapsed time per iteration (ms): 103591.1 | throughput per GPU (TFLOP/s/GPU): 74.4 | learning rate: 1.833333E-06 | global batch size: 64 | lm loss: 1.001601E+00 | loss scale: 1.0 | grad norm: 3.746 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:12:47] iteration 12/ 1000 | consumed samples: 768 | elapsed time per iteration (ms): 90256.0 | throughput per GPU (TFLOP/s/GPU): 85.4 | learning rate: 2.000000E-06 | global batch size: 64 | lm loss: 9.547743E-01 | loss scale: 1.0 | grad norm: 2.667 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-27 13:14:15] iteration 13/ 1000 | consumed samples: 832 | elapsed time per iteration (ms): 88156.2 | throughput per GPU (TFLOP/s/GPU): 87.4 | learning rate: 2.166667E-06 | global batch size: 64 | lm loss: 1.024508E+00 | loss scale: 1.0 | grad norm: 9.773 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 13:15:46] iteration 14/ 1000 | consumed samples: 896 | elapsed time per iteration (ms): 91099.2 | throughput per GPU (TFLOP/s/GPU): 84.6 | learning rate: 2.333333E-06 | global batch size: 64 | lm loss: 8.997112E-01 | loss scale: 1.0 | grad norm: 2.957 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 13:17:08] iteration 15/ 1000 | consumed samples: 960 | elapsed time per iteration (ms): 82038.9 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 2.500000E-06 | global batch size: 64 | lm loss: 8.934980E-01 | loss scale: 1.0 | grad norm: 6.978 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 13:18:54] iteration 16/ 1000 | consumed samples: 1024 | elapsed time per iteration (ms): 106026.4 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 2.666667E-06 | global batch size: 64 | lm loss: 7.564893E-01 | loss scale: 1.0 | grad norm: 1.403 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-27 13:20:39] iteration 17/ 1000 | consumed samples: 1088 | elapsed time per iteration (ms): 104400.9 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 2.833333E-06 | global batch size: 64 | lm loss: 7.983471E-01 | loss scale: 1.0 | grad norm: 1.099 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:22:03] iteration 18/ 1000 | consumed samples: 1152 | elapsed time per iteration (ms): 84307.6 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 3.000000E-06 | global batch size: 64 | lm loss: 8.053264E-01 | loss scale: 1.0 | grad norm: 1.238 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 13:24:02] iteration 19/ 1000 | consumed samples: 1216 | elapsed time per iteration (ms): 118484.8 | throughput per GPU (TFLOP/s/GPU): 65.1 | learning rate: 3.166667E-06 | global batch size: 64 | lm loss: 7.831764E-01 | loss scale: 1.0 | grad norm: 1.214 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-27 13:25:59] iteration 20/ 1000 | consumed samples: 1280 | elapsed time per iteration (ms): 117173.2 | throughput per GPU (TFLOP/s/GPU): 65.8 | learning rate: 3.333333E-06 | global batch size: 64 | lm loss: 7.992147E-01 | loss scale: 1.0 | grad norm: 1.085 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:27:36] iteration 21/ 1000 | consumed samples: 1344 | elapsed time per iteration (ms): 97623.0 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 3.500000E-06 | global batch size: 64 | lm loss: 8.462799E-01 | loss scale: 1.0 | grad norm: 1.922 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 13:29:03] iteration 22/ 1000 | consumed samples: 1408 | elapsed time per iteration (ms): 87114.5 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 3.666667E-06 | global batch size: 64 | lm loss: 7.865314E-01 | loss scale: 1.0 | grad norm: 1.322 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:30:21] iteration 23/ 1000 | consumed samples: 1472 | elapsed time per iteration (ms): 77282.9 | throughput per GPU (TFLOP/s/GPU): 99.7 | learning rate: 3.833333E-06 | global batch size: 64 | lm loss: 7.984553E-01 | loss scale: 1.0 | grad norm: 1.140 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 13:31:44] iteration 24/ 1000 | consumed samples: 1536 | elapsed time per iteration (ms): 83399.5 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 4.000000E-06 | global batch size: 64 | lm loss: 7.098362E-01 | loss scale: 1.0 | grad norm: 1.013 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-27 13:32:54] iteration 25/ 1000 | consumed samples: 1600 | elapsed time per iteration (ms): 70320.5 | throughput per GPU (TFLOP/s/GPU): 109.6 | learning rate: 4.166667E-06 | global batch size: 64 | lm loss: 7.817208E-01 | loss scale: 1.0 | grad norm: 1.044 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure + [2024-11-27 13:34:26] iteration 26/ 1000 | consumed samples: 1664 | elapsed time per iteration (ms): 91962.2 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 4.333333E-06 | global batch size: 64 | lm loss: 7.768258E-01 | loss scale: 1.0 | grad norm: 1.056 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-27 13:36:13] iteration 27/ 1000 | consumed samples: 1728 | elapsed time per iteration (ms): 106582.5 | throughput per GPU (TFLOP/s/GPU): 72.3 | learning rate: 4.500000E-06 | global batch size: 64 | lm loss: 7.915913E-01 | loss scale: 1.0 | grad norm: 1.039 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:38:00] iteration 28/ 1000 | consumed samples: 1792 | elapsed time per iteration (ms): 107384.1 | throughput per GPU (TFLOP/s/GPU): 71.8 | learning rate: 4.666667E-06 | global batch size: 64 | lm loss: 8.112209E-01 | loss scale: 1.0 | grad norm: 1.072 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure + [2024-11-27 13:39:28] iteration 29/ 1000 | consumed samples: 1856 | elapsed time per iteration (ms): 87109.0 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 4.833333E-06 | global batch size: 64 | lm loss: 7.587413E-01 | loss scale: 1.0 | grad norm: 0.839 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 13:41:04] iteration 30/ 1000 | consumed samples: 1920 | elapsed time per iteration (ms): 96509.7 | throughput per GPU (TFLOP/s/GPU): 79.9 | learning rate: 5.000000E-06 | global batch size: 64 | lm loss: 7.194266E-01 | loss scale: 1.0 | grad norm: 0.857 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 13:42:18] iteration 31/ 1000 | consumed samples: 1984 | elapsed time per iteration (ms): 74196.3 | throughput per GPU (TFLOP/s/GPU): 103.9 | learning rate: 4.999987E-06 | global batch size: 64 | lm loss: 7.742970E-01 | loss scale: 1.0 | grad norm: 0.987 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:43:48] iteration 32/ 1000 | consumed samples: 2048 | elapsed time per iteration (ms): 90021.2 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 4.999949E-06 | global batch size: 64 | lm loss: 8.077890E-01 | loss scale: 1.0 | grad norm: 1.652 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530 +[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530 +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530 +[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530 +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530 +[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530 +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530 +[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530 +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 13:45:32] iteration 33/ 1000 | consumed samples: 2112 | elapsed time per iteration (ms): 103808.1 | throughput per GPU (TFLOP/s/GPU): 74.3 | learning rate: 4.999884E-06 | global batch size: 64 | lm loss: 7.570043E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:47:09] iteration 34/ 1000 | consumed samples: 2176 | elapsed time per iteration (ms): 96719.9 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 4.999794E-06 | global batch size: 64 | lm loss: 7.298007E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 13:48:35] iteration 35/ 1000 | consumed samples: 2240 | elapsed time per iteration (ms): 85660.6 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 4.999679E-06 | global batch size: 64 | lm loss: 7.571429E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530 +[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530 +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530 +[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530 +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530 +[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530 +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530 +[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530 +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure + [2024-11-27 13:50:08] iteration 36/ 1000 | consumed samples: 2304 | elapsed time per iteration (ms): 93714.6 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 4.999537E-06 | global batch size: 64 | lm loss: 7.604092E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 13:51:33] iteration 37/ 1000 | consumed samples: 2368 | elapsed time per iteration (ms): 84669.5 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 4.999370E-06 | global batch size: 64 | lm loss: 7.159459E-01 | loss scale: 1.0 | grad norm: 0.879 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-27 13:53:03] iteration 38/ 1000 | consumed samples: 2432 | elapsed time per iteration (ms): 90181.2 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 4.999178E-06 | global batch size: 64 | lm loss: 7.358369E-01 | loss scale: 1.0 | grad norm: 1.060 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 13:54:22] iteration 39/ 1000 | consumed samples: 2496 | elapsed time per iteration (ms): 79226.7 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 4.998959E-06 | global batch size: 64 | lm loss: 7.418987E-01 | loss scale: 1.0 | grad norm: 0.799 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-27 13:55:49] iteration 40/ 1000 | consumed samples: 2560 | elapsed time per iteration (ms): 86430.1 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 4.998715E-06 | global batch size: 64 | lm loss: 7.414553E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-27 13:57:24] iteration 41/ 1000 | consumed samples: 2624 | elapsed time per iteration (ms): 95156.8 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 4.998445E-06 | global batch size: 64 | lm loss: 7.269660E-01 | loss scale: 1.0 | grad norm: 0.875 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-27 13:58:48] iteration 42/ 1000 | consumed samples: 2688 | elapsed time per iteration (ms): 84230.2 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.998150E-06 | global batch size: 64 | lm loss: 6.718149E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure + [2024-11-27 14:00:19] iteration 43/ 1000 | consumed samples: 2752 | elapsed time per iteration (ms): 91001.1 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 4.997829E-06 | global batch size: 64 | lm loss: 8.040012E-01 | loss scale: 1.0 | grad norm: 1.645 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-27 14:01:43] iteration 44/ 1000 | consumed samples: 2816 | elapsed time per iteration (ms): 83601.6 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 4.997482E-06 | global batch size: 64 | lm loss: 7.258285E-01 | loss scale: 1.0 | grad norm: 0.891 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure + [2024-11-27 14:02:59] iteration 45/ 1000 | consumed samples: 2880 | elapsed time per iteration (ms): 75874.0 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 4.997109E-06 | global batch size: 64 | lm loss: 7.296727E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 14:04:26] iteration 46/ 1000 | consumed samples: 2944 | elapsed time per iteration (ms): 87483.0 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.996711E-06 | global batch size: 64 | lm loss: 7.469407E-01 | loss scale: 1.0 | grad norm: 0.989 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure + [2024-11-27 14:05:36] iteration 47/ 1000 | consumed samples: 3008 | elapsed time per iteration (ms): 69902.1 | throughput per GPU (TFLOP/s/GPU): 110.3 | learning rate: 4.996287E-06 | global batch size: 64 | lm loss: 6.770863E-01 | loss scale: 1.0 | grad norm: 1.086 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure + [2024-11-27 14:06:49] iteration 48/ 1000 | consumed samples: 3072 | elapsed time per iteration (ms): 73109.8 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.995838E-06 | global batch size: 64 | lm loss: 7.589791E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 14:08:27] iteration 49/ 1000 | consumed samples: 3136 | elapsed time per iteration (ms): 97771.8 | throughput per GPU (TFLOP/s/GPU): 78.8 | learning rate: 4.995363E-06 | global batch size: 64 | lm loss: 7.114014E-01 | loss scale: 1.0 | grad norm: 0.783 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure + [2024-11-27 14:10:59] iteration 50/ 1000 | consumed samples: 3200 | elapsed time per iteration (ms): 151487.1 | throughput per GPU (TFLOP/s/GPU): 50.9 | learning rate: 4.994862E-06 | global batch size: 64 | lm loss: 6.831369E-01 | loss scale: 1.0 | grad norm: 0.813 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 14:12:58] iteration 51/ 1000 | consumed samples: 3264 | elapsed time per iteration (ms): 119229.3 | throughput per GPU (TFLOP/s/GPU): 64.7 | learning rate: 4.994335E-06 | global batch size: 64 | lm loss: 6.711353E-01 | loss scale: 1.0 | grad norm: 0.795 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-27 14:14:21] iteration 52/ 1000 | consumed samples: 3328 | elapsed time per iteration (ms): 82743.2 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 4.993783E-06 | global batch size: 64 | lm loss: 6.961546E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 14:15:36] iteration 53/ 1000 | consumed samples: 3392 | elapsed time per iteration (ms): 75412.8 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 4.993206E-06 | global batch size: 64 | lm loss: 7.831711E-01 | loss scale: 1.0 | grad norm: 0.820 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-27 14:17:03] iteration 54/ 1000 | consumed samples: 3456 | elapsed time per iteration (ms): 86695.8 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 4.992602E-06 | global batch size: 64 | lm loss: 7.702816E-01 | loss scale: 1.0 | grad norm: 1.704 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 14:18:34] iteration 55/ 1000 | consumed samples: 3520 | elapsed time per iteration (ms): 90982.9 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 4.991973E-06 | global batch size: 64 | lm loss: 7.983244E-01 | loss scale: 1.0 | grad norm: 0.989 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 14:20:14] iteration 56/ 1000 | consumed samples: 3584 | elapsed time per iteration (ms): 100724.3 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 4.991319E-06 | global batch size: 64 | lm loss: 6.829706E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure + [2024-11-27 14:21:37] iteration 57/ 1000 | consumed samples: 3648 | elapsed time per iteration (ms): 82850.4 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 4.990639E-06 | global batch size: 64 | lm loss: 6.990687E-01 | loss scale: 1.0 | grad norm: 0.758 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 14:23:03] iteration 58/ 1000 | consumed samples: 3712 | elapsed time per iteration (ms): 85987.9 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 4.989933E-06 | global batch size: 64 | lm loss: 7.449348E-01 | loss scale: 1.0 | grad norm: 0.772 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure + [2024-11-27 14:24:15] iteration 59/ 1000 | consumed samples: 3776 | elapsed time per iteration (ms): 71954.2 | throughput per GPU (TFLOP/s/GPU): 107.1 | learning rate: 4.989201E-06 | global batch size: 64 | lm loss: 7.466602E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-27 14:25:53] iteration 60/ 1000 | consumed samples: 3840 | elapsed time per iteration (ms): 97724.8 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 4.988444E-06 | global batch size: 64 | lm loss: 7.879194E-01 | loss scale: 1.0 | grad norm: 0.734 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 14:27:15] iteration 61/ 1000 | consumed samples: 3904 | elapsed time per iteration (ms): 82152.3 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 4.987662E-06 | global batch size: 64 | lm loss: 7.262596E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dede47780] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-27 14:28:32] iteration 62/ 1000 | consumed samples: 3968 | elapsed time per iteration (ms): 76831.2 | throughput per GPU (TFLOP/s/GPU): 100.3 | learning rate: 4.986854E-06 | global batch size: 64 | lm loss: 7.915837E-01 | loss scale: 1.0 | grad norm: 0.811 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-27 14:30:03] iteration 63/ 1000 | consumed samples: 4032 | elapsed time per iteration (ms): 91242.8 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 4.986020E-06 | global batch size: 64 | lm loss: 9.017408E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 14:31:13] iteration 64/ 1000 | consumed samples: 4096 | elapsed time per iteration (ms): 70065.0 | throughput per GPU (TFLOP/s/GPU): 110.0 | learning rate: 4.985161E-06 | global batch size: 64 | lm loss: 7.884458E-01 | loss scale: 1.0 | grad norm: 0.670 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 14:33:07] iteration 65/ 1000 | consumed samples: 4160 | elapsed time per iteration (ms): 113360.7 | throughput per GPU (TFLOP/s/GPU): 68.0 | learning rate: 4.984276E-06 | global batch size: 64 | lm loss: 7.222143E-01 | loss scale: 1.0 | grad norm: 0.766 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 14:34:30] iteration 66/ 1000 | consumed samples: 4224 | elapsed time per iteration (ms): 83724.4 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.983366E-06 | global batch size: 64 | lm loss: 7.112570E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure + [2024-11-27 14:35:55] iteration 67/ 1000 | consumed samples: 4288 | elapsed time per iteration (ms): 84730.4 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 4.982430E-06 | global batch size: 64 | lm loss: 7.537538E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 14:37:12] iteration 68/ 1000 | consumed samples: 4352 | elapsed time per iteration (ms): 76790.5 | throughput per GPU (TFLOP/s/GPU): 100.4 | learning rate: 4.981468E-06 | global batch size: 64 | lm loss: 6.872675E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-27 14:38:48] iteration 69/ 1000 | consumed samples: 4416 | elapsed time per iteration (ms): 95911.3 | throughput per GPU (TFLOP/s/GPU): 80.4 | learning rate: 4.980482E-06 | global batch size: 64 | lm loss: 7.328756E-01 | loss scale: 1.0 | grad norm: 0.788 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-27 14:41:37] iteration 70/ 1000 | consumed samples: 4480 | elapsed time per iteration (ms): 169115.5 | throughput per GPU (TFLOP/s/GPU): 45.6 | learning rate: 4.979469E-06 | global batch size: 64 | lm loss: 7.115982E-01 | loss scale: 1.0 | grad norm: 0.753 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-27 14:43:00] iteration 71/ 1000 | consumed samples: 4544 | elapsed time per iteration (ms): 82520.7 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 4.978431E-06 | global batch size: 64 | lm loss: 7.005649E-01 | loss scale: 1.0 | grad norm: 0.698 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 14:44:33] iteration 72/ 1000 | consumed samples: 4608 | elapsed time per iteration (ms): 93772.7 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 4.977368E-06 | global batch size: 64 | lm loss: 8.310930E-01 | loss scale: 1.0 | grad norm: 0.999 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 14:46:24] iteration 73/ 1000 | consumed samples: 4672 | elapsed time per iteration (ms): 111174.0 | throughput per GPU (TFLOP/s/GPU): 69.3 | learning rate: 4.976279E-06 | global batch size: 64 | lm loss: 7.123011E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 14:47:50] iteration 74/ 1000 | consumed samples: 4736 | elapsed time per iteration (ms): 85140.8 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 4.975165E-06 | global batch size: 64 | lm loss: 8.185971E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure + [2024-11-27 14:49:33] iteration 75/ 1000 | consumed samples: 4800 | elapsed time per iteration (ms): 103443.5 | throughput per GPU (TFLOP/s/GPU): 74.5 | learning rate: 4.974025E-06 | global batch size: 64 | lm loss: 6.794741E-01 | loss scale: 1.0 | grad norm: 0.943 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-27 14:51:01] iteration 76/ 1000 | consumed samples: 4864 | elapsed time per iteration (ms): 88112.8 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 4.972860E-06 | global batch size: 64 | lm loss: 6.982243E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure + [2024-11-27 14:52:16] iteration 77/ 1000 | consumed samples: 4928 | elapsed time per iteration (ms): 75137.7 | throughput per GPU (TFLOP/s/GPU): 102.6 | learning rate: 4.971670E-06 | global batch size: 64 | lm loss: 7.333058E-01 | loss scale: 1.0 | grad norm: 1.006 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 14:53:31] iteration 78/ 1000 | consumed samples: 4992 | elapsed time per iteration (ms): 74671.8 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 4.970454E-06 | global batch size: 64 | lm loss: 6.710973E-01 | loss scale: 1.0 | grad norm: 0.689 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 14:54:51] iteration 79/ 1000 | consumed samples: 5056 | elapsed time per iteration (ms): 80178.4 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 4.969213E-06 | global batch size: 64 | lm loss: 6.840650E-01 | loss scale: 1.0 | grad norm: 1.106 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-27 14:56:19] iteration 80/ 1000 | consumed samples: 5120 | elapsed time per iteration (ms): 87527.5 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.967946E-06 | global batch size: 64 | lm loss: 7.118856E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure + [2024-11-27 14:57:29] iteration 81/ 1000 | consumed samples: 5184 | elapsed time per iteration (ms): 70098.8 | throughput per GPU (TFLOP/s/GPU): 110.0 | learning rate: 4.966654E-06 | global batch size: 64 | lm loss: 8.833607E-01 | loss scale: 1.0 | grad norm: 1.238 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 14:58:50] iteration 82/ 1000 | consumed samples: 5248 | elapsed time per iteration (ms): 81244.6 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 4.965337E-06 | global batch size: 64 | lm loss: 6.820184E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-27 15:00:24] iteration 83/ 1000 | consumed samples: 5312 | elapsed time per iteration (ms): 93369.2 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 4.963994E-06 | global batch size: 64 | lm loss: 7.110070E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 15:01:46] iteration 84/ 1000 | consumed samples: 5376 | elapsed time per iteration (ms): 81995.1 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 4.962626E-06 | global batch size: 64 | lm loss: 7.219861E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure + [2024-11-27 15:03:32] iteration 85/ 1000 | consumed samples: 5440 | elapsed time per iteration (ms): 106733.3 | throughput per GPU (TFLOP/s/GPU): 72.2 | learning rate: 4.961232E-06 | global batch size: 64 | lm loss: 6.973246E-01 | loss scale: 1.0 | grad norm: 0.815 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 15:04:48] iteration 86/ 1000 | consumed samples: 5504 | elapsed time per iteration (ms): 75544.6 | throughput per GPU (TFLOP/s/GPU): 102.0 | learning rate: 4.959814E-06 | global batch size: 64 | lm loss: 6.851219E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure + [2024-11-27 15:06:04] iteration 87/ 1000 | consumed samples: 5568 | elapsed time per iteration (ms): 76222.9 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 4.958370E-06 | global batch size: 64 | lm loss: 8.057975E-01 | loss scale: 1.0 | grad norm: 1.777 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-27 15:07:34] iteration 88/ 1000 | consumed samples: 5632 | elapsed time per iteration (ms): 90322.7 | throughput per GPU (TFLOP/s/GPU): 85.3 | learning rate: 4.956901E-06 | global batch size: 64 | lm loss: 7.231341E-01 | loss scale: 1.0 | grad norm: 1.386 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 15:08:55] iteration 89/ 1000 | consumed samples: 5696 | elapsed time per iteration (ms): 80229.7 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 4.955406E-06 | global batch size: 64 | lm loss: 7.316175E-01 | loss scale: 1.0 | grad norm: 1.309 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-27 15:10:08] iteration 90/ 1000 | consumed samples: 5760 | elapsed time per iteration (ms): 73185.2 | throughput per GPU (TFLOP/s/GPU): 105.3 | learning rate: 4.953887E-06 | global batch size: 64 | lm loss: 6.497885E-01 | loss scale: 1.0 | grad norm: 0.729 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 15:11:13] iteration 91/ 1000 | consumed samples: 5824 | elapsed time per iteration (ms): 64640.5 | throughput per GPU (TFLOP/s/GPU): 119.3 | learning rate: 4.952342E-06 | global batch size: 64 | lm loss: 6.669019E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure + [2024-11-27 15:12:37] iteration 92/ 1000 | consumed samples: 5888 | elapsed time per iteration (ms): 84408.5 | throughput per GPU (TFLOP/s/GPU): 91.3 | learning rate: 4.950772E-06 | global batch size: 64 | lm loss: 7.155092E-01 | loss scale: 1.0 | grad norm: 0.800 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-27 15:13:55] iteration 93/ 1000 | consumed samples: 5952 | elapsed time per iteration (ms): 78195.1 | throughput per GPU (TFLOP/s/GPU): 98.6 | learning rate: 4.949176E-06 | global batch size: 64 | lm loss: 6.895306E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 15:15:17] iteration 94/ 1000 | consumed samples: 6016 | elapsed time per iteration (ms): 82313.0 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 4.947556E-06 | global batch size: 64 | lm loss: 7.441191E-01 | loss scale: 1.0 | grad norm: 33.727 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 15:16:42] iteration 95/ 1000 | consumed samples: 6080 | elapsed time per iteration (ms): 84126.4 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 4.945910E-06 | global batch size: 64 | lm loss: 7.333177E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-27 15:18:12] iteration 96/ 1000 | consumed samples: 6144 | elapsed time per iteration (ms): 90870.2 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 4.944240E-06 | global batch size: 64 | lm loss: 7.676827E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 15:19:41] iteration 97/ 1000 | consumed samples: 6208 | elapsed time per iteration (ms): 88465.1 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.942544E-06 | global batch size: 64 | lm loss: 7.284559E-01 | loss scale: 1.0 | grad norm: 0.784 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 15:21:07] iteration 98/ 1000 | consumed samples: 6272 | elapsed time per iteration (ms): 85811.5 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 4.940823E-06 | global batch size: 64 | lm loss: 6.762350E-01 | loss scale: 1.0 | grad norm: 0.800 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 15:22:32] iteration 99/ 1000 | consumed samples: 6336 | elapsed time per iteration (ms): 85430.6 | throughput per GPU (TFLOP/s/GPU): 90.2 | learning rate: 4.939077E-06 | global batch size: 64 | lm loss: 7.216333E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555ded4ed500] mmco: unref short failure +[h264 @ 0x555ded4ed500] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 15:24:10] iteration 100/ 1000 | consumed samples: 6400 | elapsed time per iteration (ms): 97707.6 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 4.937306E-06 | global batch size: 64 | lm loss: 7.216407E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (204044.53, 204044.86) +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 15:29:05] iteration 101/ 1000 | consumed samples: 6464 | elapsed time per iteration (ms): 91260.4 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 4.935510E-06 | global batch size: 64 | lm loss: 6.965908E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-27 15:31:09] iteration 102/ 1000 | consumed samples: 6528 | elapsed time per iteration (ms): 123298.8 | throughput per GPU (TFLOP/s/GPU): 62.5 | learning rate: 4.933689E-06 | global batch size: 64 | lm loss: 7.452544E-01 | loss scale: 1.0 | grad norm: 1.219 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 15:32:34] iteration 103/ 1000 | consumed samples: 6592 | elapsed time per iteration (ms): 85714.0 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 4.931842E-06 | global batch size: 64 | lm loss: 6.922635E-01 | loss scale: 1.0 | grad norm: 0.806 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-27 15:34:00] iteration 104/ 1000 | consumed samples: 6656 | elapsed time per iteration (ms): 85261.0 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.929971E-06 | global batch size: 64 | lm loss: 6.881779E-01 | loss scale: 1.0 | grad norm: 0.936 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 15:35:39] iteration 105/ 1000 | consumed samples: 6720 | elapsed time per iteration (ms): 99837.9 | throughput per GPU (TFLOP/s/GPU): 77.2 | learning rate: 4.928075E-06 | global batch size: 64 | lm loss: 7.913043E-01 | loss scale: 1.0 | grad norm: 16.914 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-27 15:37:17] iteration 106/ 1000 | consumed samples: 6784 | elapsed time per iteration (ms): 97721.0 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 4.926154E-06 | global batch size: 64 | lm loss: 7.194221E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-27 15:38:50] iteration 107/ 1000 | consumed samples: 6848 | elapsed time per iteration (ms): 92398.7 | throughput per GPU (TFLOP/s/GPU): 83.4 | learning rate: 4.924208E-06 | global batch size: 64 | lm loss: 7.140918E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 15:40:03] iteration 108/ 1000 | consumed samples: 6912 | elapsed time per iteration (ms): 73276.4 | throughput per GPU (TFLOP/s/GPU): 105.2 | learning rate: 4.922237E-06 | global batch size: 64 | lm loss: 6.261149E-01 | loss scale: 1.0 | grad norm: 0.789 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure + [2024-11-27 15:41:20] iteration 109/ 1000 | consumed samples: 6976 | elapsed time per iteration (ms): 76692.5 | throughput per GPU (TFLOP/s/GPU): 100.5 | learning rate: 4.920242E-06 | global batch size: 64 | lm loss: 6.905310E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 15:42:40] iteration 110/ 1000 | consumed samples: 7040 | elapsed time per iteration (ms): 79999.4 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 4.918221E-06 | global batch size: 64 | lm loss: 7.688470E-01 | loss scale: 1.0 | grad norm: 238.888 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-27 15:43:52] iteration 111/ 1000 | consumed samples: 7104 | elapsed time per iteration (ms): 72378.0 | throughput per GPU (TFLOP/s/GPU): 106.5 | learning rate: 4.916176E-06 | global batch size: 64 | lm loss: 7.495630E-01 | loss scale: 1.0 | grad norm: 0.807 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 15:45:23] iteration 112/ 1000 | consumed samples: 7168 | elapsed time per iteration (ms): 90592.2 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.914105E-06 | global batch size: 64 | lm loss: 6.877882E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 15:46:40] iteration 113/ 1000 | consumed samples: 7232 | elapsed time per iteration (ms): 77005.3 | throughput per GPU (TFLOP/s/GPU): 100.1 | learning rate: 4.912010E-06 | global batch size: 64 | lm loss: 6.341010E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 15:48:20] iteration 114/ 1000 | consumed samples: 7296 | elapsed time per iteration (ms): 100122.3 | throughput per GPU (TFLOP/s/GPU): 77.0 | learning rate: 4.909890E-06 | global batch size: 64 | lm loss: 6.689736E-01 | loss scale: 1.0 | grad norm: 0.783 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 15:49:36] iteration 115/ 1000 | consumed samples: 7360 | elapsed time per iteration (ms): 76015.8 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 4.907746E-06 | global batch size: 64 | lm loss: 7.142720E-01 | loss scale: 1.0 | grad norm: 2.804 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 15:51:09] iteration 116/ 1000 | consumed samples: 7424 | elapsed time per iteration (ms): 93186.2 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 4.905577E-06 | global batch size: 64 | lm loss: 7.067767E-01 | loss scale: 1.0 | grad norm: 0.931 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure + [2024-11-27 15:52:52] iteration 117/ 1000 | consumed samples: 7488 | elapsed time per iteration (ms): 102965.8 | throughput per GPU (TFLOP/s/GPU): 74.9 | learning rate: 4.903383E-06 | global batch size: 64 | lm loss: 7.815012E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-27 15:54:06] iteration 118/ 1000 | consumed samples: 7552 | elapsed time per iteration (ms): 73738.2 | throughput per GPU (TFLOP/s/GPU): 104.5 | learning rate: 4.901164E-06 | global batch size: 64 | lm loss: 7.462224E-01 | loss scale: 1.0 | grad norm: 1.016 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-27 15:55:31] iteration 119/ 1000 | consumed samples: 7616 | elapsed time per iteration (ms): 85789.3 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 4.898920E-06 | global batch size: 64 | lm loss: 6.693496E-01 | loss scale: 1.0 | grad norm: 0.775 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 15:56:58] iteration 120/ 1000 | consumed samples: 7680 | elapsed time per iteration (ms): 86049.7 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 4.896652E-06 | global batch size: 64 | lm loss: 6.334546E-01 | loss scale: 1.0 | grad norm: 0.820 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure + [2024-11-27 16:00:27] iteration 121/ 1000 | consumed samples: 7744 | elapsed time per iteration (ms): 209154.2 | throughput per GPU (TFLOP/s/GPU): 36.9 | learning rate: 4.894360E-06 | global batch size: 64 | lm loss: 6.660936E-01 | loss scale: 1.0 | grad norm: 1.144 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 16:02:16] iteration 122/ 1000 | consumed samples: 7808 | elapsed time per iteration (ms): 109483.9 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 4.892043E-06 | global batch size: 64 | lm loss: 6.861758E-01 | loss scale: 1.0 | grad norm: 0.732 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 16:03:48] iteration 123/ 1000 | consumed samples: 7872 | elapsed time per iteration (ms): 92055.5 | throughput per GPU (TFLOP/s/GPU): 83.7 | learning rate: 4.889701E-06 | global batch size: 64 | lm loss: 7.023684E-01 | loss scale: 1.0 | grad norm: 27.807 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 16:05:44] iteration 124/ 1000 | consumed samples: 7936 | elapsed time per iteration (ms): 115893.3 | throughput per GPU (TFLOP/s/GPU): 66.5 | learning rate: 4.887334E-06 | global batch size: 64 | lm loss: 7.402400E-01 | loss scale: 1.0 | grad norm: 1.045 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure + [2024-11-27 16:07:07] iteration 125/ 1000 | consumed samples: 8000 | elapsed time per iteration (ms): 83072.7 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 4.884944E-06 | global batch size: 64 | lm loss: 7.056447E-01 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 16:08:45] iteration 126/ 1000 | consumed samples: 8064 | elapsed time per iteration (ms): 97614.9 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 4.882528E-06 | global batch size: 64 | lm loss: 7.042141E-01 | loss scale: 1.0 | grad norm: 1.142 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955acf300] [h264 @ 0x555dedfc6580] mmco: unref short failure +mmco: unref short failure +[h264 @ 0x55d955acf300] [h264 @ 0x555dedfc6580] mmco: unref short failure +mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 16:10:15] iteration 127/ 1000 | consumed samples: 8128 | elapsed time per iteration (ms): 89749.7 | throughput per GPU (TFLOP/s/GPU): 85.9 | learning rate: 4.880088E-06 | global batch size: 64 | lm loss: 7.120880E-01 | loss scale: 1.0 | grad norm: 0.859 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure + [2024-11-27 16:11:50] iteration 128/ 1000 | consumed samples: 8192 | elapsed time per iteration (ms): 95187.7 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 4.877624E-06 | global batch size: 64 | lm loss: 6.666554E-01 | loss scale: 1.0 | grad norm: 1.103 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 16:13:19] iteration 129/ 1000 | consumed samples: 8256 | elapsed time per iteration (ms): 89302.8 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 4.875136E-06 | global batch size: 64 | lm loss: 6.939200E-01 | loss scale: 1.0 | grad norm: 0.925 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95cbcf940] mmco: unref short failure +[h264 @ 0x55d95cbcf940] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 16:14:34] iteration 130/ 1000 | consumed samples: 8320 | elapsed time per iteration (ms): 75267.7 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 4.872622E-06 | global batch size: 64 | lm loss: 6.710307E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 16:16:04] iteration 131/ 1000 | consumed samples: 8384 | elapsed time per iteration (ms): 89805.7 | throughput per GPU (TFLOP/s/GPU): 85.8 | learning rate: 4.870085E-06 | global batch size: 64 | lm loss: 6.678811E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-27 16:18:09] iteration 132/ 1000 | consumed samples: 8448 | elapsed time per iteration (ms): 124949.2 | throughput per GPU (TFLOP/s/GPU): 61.7 | learning rate: 4.867523E-06 | global batch size: 64 | lm loss: 7.045669E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 16:19:25] iteration 133/ 1000 | consumed samples: 8512 | elapsed time per iteration (ms): 75527.8 | throughput per GPU (TFLOP/s/GPU): 102.1 | learning rate: 4.864937E-06 | global batch size: 64 | lm loss: 6.984726E-01 | loss scale: 1.0 | grad norm: 1.503 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure + [2024-11-27 16:21:20] iteration 134/ 1000 | consumed samples: 8576 | elapsed time per iteration (ms): 115224.9 | throughput per GPU (TFLOP/s/GPU): 66.9 | learning rate: 4.862327E-06 | global batch size: 64 | lm loss: 7.181825E-01 | loss scale: 1.0 | grad norm: 0.871 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 16:23:06] iteration 135/ 1000 | consumed samples: 8640 | elapsed time per iteration (ms): 106079.9 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 4.859692E-06 | global batch size: 64 | lm loss: 7.264897E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 16:24:21] iteration 136/ 1000 | consumed samples: 8704 | elapsed time per iteration (ms): 75408.5 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 4.857033E-06 | global batch size: 64 | lm loss: 6.576435E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations: 0 | number of nan iterations: 0 | +processed_samples 500 unjoint_samples 500 joint_samples 35 [105962, 100832] +processed_samples 500 unjoint_samples 500 joint_samples 35 [105962, 100832] +processed_samples 500 unjoint_samples 500 joint_samples 33 [20193, 118380] +processed_samples 500 unjoint_samples 500 joint_samples 33 [20193, 118380] +processed_samples 500 unjoint_samples 500 joint_samples 29 [106623, 110615] +processed_samples 500 unjoint_samples 500 joint_samples 29 [106623, 110615] +processed_samples 500 unjoint_samples 500 joint_samples 39 [92083, 124515] +processed_samples 500 unjoint_samples 500 joint_samples 39 [92083, 124515] +processed_samples 500 unjoint_samples 500 joint_samples 31 [119429, 119622] +processed_samples 500 unjoint_samples 500 joint_samples 31 [119429, 119622] +processed_samples 500 unjoint_samples 500 joint_samples 34 [121945, 117011] +processed_samples 500 unjoint_samples 500 joint_samples 34 [121945, 117011] +processed_samples 500 unjoint_samples 500 joint_samples 33 [106663, 103375] +processed_samples 500 unjoint_samples 500 joint_samples 33 [106663, 103375] +processed_samples 500 unjoint_samples 500 joint_samples 38 [63985, 122137] +processed_samples 500 unjoint_samples 500 joint_samples 38 [63985, 122137] +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 16:25:45] iteration 137/ 1000 | consumed samples: 8768 | elapsed time per iteration (ms): 83158.4 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 4.854350E-06 | global batch size: 64 | lm loss: 7.173734E-01 | loss scale: 1.0 | grad norm: 0.760 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 16:27:18] iteration 138/ 1000 | consumed samples: 8832 | elapsed time per iteration (ms): 92897.0 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 4.851643E-06 | global batch size: 64 | lm loss: 6.300446E-01 | loss scale: 1.0 | grad norm: 2.823 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 16:28:34] iteration 139/ 1000 | consumed samples: 8896 | elapsed time per iteration (ms): 76658.4 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 4.848912E-06 | global batch size: 64 | lm loss: 6.692375E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 16:30:17] iteration 140/ 1000 | consumed samples: 8960 | elapsed time per iteration (ms): 102415.8 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 4.846156E-06 | global batch size: 64 | lm loss: 6.938949E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 16:31:41] iteration 141/ 1000 | consumed samples: 9024 | elapsed time per iteration (ms): 84222.7 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.843377E-06 | global batch size: 64 | lm loss: 7.084374E-01 | loss scale: 1.0 | grad norm: 0.882 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure + [2024-11-27 16:33:20] iteration 142/ 1000 | consumed samples: 9088 | elapsed time per iteration (ms): 99273.9 | throughput per GPU (TFLOP/s/GPU): 77.6 | learning rate: 4.840573E-06 | global batch size: 64 | lm loss: 6.900647E-01 | loss scale: 1.0 | grad norm: 0.764 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 16:35:40] iteration 143/ 1000 | consumed samples: 9152 | elapsed time per iteration (ms): 139672.6 | throughput per GPU (TFLOP/s/GPU): 55.2 | learning rate: 4.837746E-06 | global batch size: 64 | lm loss: 7.882947E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 16:37:20] iteration 144/ 1000 | consumed samples: 9216 | elapsed time per iteration (ms): 99927.0 | throughput per GPU (TFLOP/s/GPU): 77.1 | learning rate: 4.834894E-06 | global batch size: 64 | lm loss: 6.919404E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-27 16:38:31] iteration 145/ 1000 | consumed samples: 9280 | elapsed time per iteration (ms): 71577.1 | throughput per GPU (TFLOP/s/GPU): 107.7 | learning rate: 4.832018E-06 | global batch size: 64 | lm loss: 7.083249E-01 | loss scale: 1.0 | grad norm: 0.931 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 16:39:49] iteration 146/ 1000 | consumed samples: 9344 | elapsed time per iteration (ms): 77590.9 | throughput per GPU (TFLOP/s/GPU): 99.3 | learning rate: 4.829119E-06 | global batch size: 64 | lm loss: 6.647974E-01 | loss scale: 1.0 | grad norm: 0.815 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 16:41:27] iteration 147/ 1000 | consumed samples: 9408 | elapsed time per iteration (ms): 97953.2 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 4.826195E-06 | global batch size: 64 | lm loss: 6.562359E-01 | loss scale: 1.0 | grad norm: 1.320 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-27 16:42:54] iteration 148/ 1000 | consumed samples: 9472 | elapsed time per iteration (ms): 87479.2 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.823248E-06 | global batch size: 64 | lm loss: 6.847668E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-27 16:44:16] iteration 149/ 1000 | consumed samples: 9536 | elapsed time per iteration (ms): 81834.8 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.820277E-06 | global batch size: 64 | lm loss: 6.775054E-01 | loss scale: 1.0 | grad norm: 3.057 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9568d0900] [h264 @ 0x555defd60880] mmco: unref short failure +mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 16:45:27] iteration 150/ 1000 | consumed samples: 9600 | elapsed time per iteration (ms): 70827.7 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 4.817282E-06 | global batch size: 64 | lm loss: 7.231870E-01 | loss scale: 1.0 | grad norm: 0.991 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure + [2024-11-27 16:47:26] iteration 151/ 1000 | consumed samples: 9664 | elapsed time per iteration (ms): 119267.8 | throughput per GPU (TFLOP/s/GPU): 64.6 | learning rate: 4.814263E-06 | global batch size: 64 | lm loss: 6.249514E-01 | loss scale: 1.0 | grad norm: 1.199 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 16:48:42] iteration 152/ 1000 | consumed samples: 9728 | elapsed time per iteration (ms): 75657.6 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 4.811221E-06 | global batch size: 64 | lm loss: 7.830349E-01 | loss scale: 1.0 | grad norm: 3.815 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 16:50:09] iteration 153/ 1000 | consumed samples: 9792 | elapsed time per iteration (ms): 86625.1 | throughput per GPU (TFLOP/s/GPU): 89.0 | learning rate: 4.808155E-06 | global batch size: 64 | lm loss: 6.208155E-01 | loss scale: 1.0 | grad norm: 0.803 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 16:51:29] iteration 154/ 1000 | consumed samples: 9856 | elapsed time per iteration (ms): 79964.0 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 4.805065E-06 | global batch size: 64 | lm loss: 7.659938E-01 | loss scale: 1.0 | grad norm: 0.992 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure + [2024-11-27 16:53:04] iteration 155/ 1000 | consumed samples: 9920 | elapsed time per iteration (ms): 95631.4 | throughput per GPU (TFLOP/s/GPU): 80.6 | learning rate: 4.801951E-06 | global batch size: 64 | lm loss: 6.927797E-01 | loss scale: 1.0 | grad norm: 0.983 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 16:54:17] iteration 156/ 1000 | consumed samples: 9984 | elapsed time per iteration (ms): 72721.3 | throughput per GPU (TFLOP/s/GPU): 106.0 | learning rate: 4.798814E-06 | global batch size: 64 | lm loss: 7.079293E-01 | loss scale: 1.0 | grad norm: 8.416 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-27 16:55:37] iteration 157/ 1000 | consumed samples: 10048 | elapsed time per iteration (ms): 79895.5 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 4.795653E-06 | global batch size: 64 | lm loss: 6.731150E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure + [2024-11-27 16:57:34] iteration 158/ 1000 | consumed samples: 10112 | elapsed time per iteration (ms): 116763.1 | throughput per GPU (TFLOP/s/GPU): 66.0 | learning rate: 4.792469E-06 | global batch size: 64 | lm loss: 7.790256E-01 | loss scale: 1.0 | grad norm: 0.822 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure + [2024-11-27 16:59:25] iteration 159/ 1000 | consumed samples: 10176 | elapsed time per iteration (ms): 111649.0 | throughput per GPU (TFLOP/s/GPU): 69.0 | learning rate: 4.789261E-06 | global batch size: 64 | lm loss: 7.043136E-01 | loss scale: 1.0 | grad norm: 0.877 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure + [2024-11-27 17:01:01] iteration 160/ 1000 | consumed samples: 10240 | elapsed time per iteration (ms): 96076.4 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 4.786030E-06 | global batch size: 64 | lm loss: 6.699425E-01 | loss scale: 1.0 | grad norm: 0.764 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-27 17:02:47] iteration 161/ 1000 | consumed samples: 10304 | elapsed time per iteration (ms): 105622.4 | throughput per GPU (TFLOP/s/GPU): 73.0 | learning rate: 4.782775E-06 | global batch size: 64 | lm loss: 6.520627E-01 | loss scale: 1.0 | grad norm: 1.032 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure + [2024-11-27 17:04:19] iteration 162/ 1000 | consumed samples: 10368 | elapsed time per iteration (ms): 91872.5 | throughput per GPU (TFLOP/s/GPU): 83.9 | learning rate: 4.779497E-06 | global batch size: 64 | lm loss: 7.085309E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-27 17:05:43] iteration 163/ 1000 | consumed samples: 10432 | elapsed time per iteration (ms): 83853.9 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 4.776195E-06 | global batch size: 64 | lm loss: 7.403272E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 17:07:09] iteration 164/ 1000 | consumed samples: 10496 | elapsed time per iteration (ms): 85988.2 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 4.772870E-06 | global batch size: 64 | lm loss: 7.570689E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 17:09:03] iteration 165/ 1000 | consumed samples: 10560 | elapsed time per iteration (ms): 114128.6 | throughput per GPU (TFLOP/s/GPU): 67.5 | learning rate: 4.769522E-06 | global batch size: 64 | lm loss: 6.929017E-01 | loss scale: 1.0 | grad norm: 1.047 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-27 17:10:26] iteration 166/ 1000 | consumed samples: 10624 | elapsed time per iteration (ms): 82536.4 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 4.766150E-06 | global batch size: 64 | lm loss: 6.568977E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-27 17:11:55] iteration 167/ 1000 | consumed samples: 10688 | elapsed time per iteration (ms): 89932.8 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 4.762755E-06 | global batch size: 64 | lm loss: 7.344331E-01 | loss scale: 1.0 | grad norm: 1.168 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 17:13:22] iteration 168/ 1000 | consumed samples: 10752 | elapsed time per iteration (ms): 86819.8 | throughput per GPU (TFLOP/s/GPU): 88.8 | learning rate: 4.759337E-06 | global batch size: 64 | lm loss: 7.115574E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 17:14:44] iteration 169/ 1000 | consumed samples: 10816 | elapsed time per iteration (ms): 81933.9 | throughput per GPU (TFLOP/s/GPU): 94.1 | learning rate: 4.755896E-06 | global batch size: 64 | lm loss: 7.456884E-01 | loss scale: 1.0 | grad norm: 0.996 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 17:16:22] iteration 170/ 1000 | consumed samples: 10880 | elapsed time per iteration (ms): 97941.6 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 4.752432E-06 | global batch size: 64 | lm loss: 7.081493E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 17:19:09] iteration 171/ 1000 | consumed samples: 10944 | elapsed time per iteration (ms): 166384.3 | throughput per GPU (TFLOP/s/GPU): 46.3 | learning rate: 4.748944E-06 | global batch size: 64 | lm loss: 6.956521E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-27 17:20:29] iteration 172/ 1000 | consumed samples: 11008 | elapsed time per iteration (ms): 80851.7 | throughput per GPU (TFLOP/s/GPU): 95.3 | learning rate: 4.745434E-06 | global batch size: 64 | lm loss: 7.245098E-01 | loss scale: 1.0 | grad norm: 1.193 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure + [2024-11-27 17:21:57] iteration 173/ 1000 | consumed samples: 11072 | elapsed time per iteration (ms): 87496.9 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.741900E-06 | global batch size: 64 | lm loss: 7.148576E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure + [2024-11-27 17:23:19] iteration 174/ 1000 | consumed samples: 11136 | elapsed time per iteration (ms): 81798.7 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.738344E-06 | global batch size: 64 | lm loss: 7.298409E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 17:25:03] iteration 175/ 1000 | consumed samples: 11200 | elapsed time per iteration (ms): 104541.5 | throughput per GPU (TFLOP/s/GPU): 73.7 | learning rate: 4.734764E-06 | global batch size: 64 | lm loss: 7.137690E-01 | loss scale: 1.0 | grad norm: 0.988 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 17:26:48] iteration 176/ 1000 | consumed samples: 11264 | elapsed time per iteration (ms): 104206.7 | throughput per GPU (TFLOP/s/GPU): 74.0 | learning rate: 4.731162E-06 | global batch size: 64 | lm loss: 6.374616E-01 | loss scale: 1.0 | grad norm: 0.791 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 17:28:10] iteration 177/ 1000 | consumed samples: 11328 | elapsed time per iteration (ms): 82484.2 | throughput per GPU (TFLOP/s/GPU): 93.5 | learning rate: 4.727537E-06 | global batch size: 64 | lm loss: 6.290931E-01 | loss scale: 1.0 | grad norm: 0.993 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure + [2024-11-27 17:29:48] iteration 178/ 1000 | consumed samples: 11392 | elapsed time per iteration (ms): 98160.7 | throughput per GPU (TFLOP/s/GPU): 78.5 | learning rate: 4.723889E-06 | global batch size: 64 | lm loss: 6.425977E-01 | loss scale: 1.0 | grad norm: 0.863 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure + [2024-11-27 17:31:09] iteration 179/ 1000 | consumed samples: 11456 | elapsed time per iteration (ms): 80280.2 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.720218E-06 | global batch size: 64 | lm loss: 7.326845E-01 | loss scale: 1.0 | grad norm: 1.031 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-27 17:32:37] iteration 180/ 1000 | consumed samples: 11520 | elapsed time per iteration (ms): 88866.5 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.716524E-06 | global batch size: 64 | lm loss: 6.877599E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 17:34:02] iteration 181/ 1000 | consumed samples: 11584 | elapsed time per iteration (ms): 84486.9 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 4.712808E-06 | global batch size: 64 | lm loss: 6.831203E-01 | loss scale: 1.0 | grad norm: 0.788 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 17:35:22] iteration 182/ 1000 | consumed samples: 11648 | elapsed time per iteration (ms): 80280.8 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.709068E-06 | global batch size: 64 | lm loss: 7.477552E-01 | loss scale: 1.0 | grad norm: 0.931 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 17:36:37] iteration 183/ 1000 | consumed samples: 11712 | elapsed time per iteration (ms): 74934.2 | throughput per GPU (TFLOP/s/GPU): 102.9 | learning rate: 4.705307E-06 | global batch size: 64 | lm loss: 6.492839E-01 | loss scale: 1.0 | grad norm: 0.789 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure + [2024-11-27 17:38:25] iteration 184/ 1000 | consumed samples: 11776 | elapsed time per iteration (ms): 108061.1 | throughput per GPU (TFLOP/s/GPU): 71.3 | learning rate: 4.701522E-06 | global batch size: 64 | lm loss: 6.642103E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-27 17:39:58] iteration 185/ 1000 | consumed samples: 11840 | elapsed time per iteration (ms): 92756.7 | throughput per GPU (TFLOP/s/GPU): 83.1 | learning rate: 4.697715E-06 | global batch size: 64 | lm loss: 6.904918E-01 | loss scale: 1.0 | grad norm: 1.049 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 17:41:22] iteration 186/ 1000 | consumed samples: 11904 | elapsed time per iteration (ms): 83547.8 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 4.693886E-06 | global batch size: 64 | lm loss: 7.127905E-01 | loss scale: 1.0 | grad norm: 0.798 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 17:43:29] iteration 187/ 1000 | consumed samples: 11968 | elapsed time per iteration (ms): 127045.6 | throughput per GPU (TFLOP/s/GPU): 60.7 | learning rate: 4.690034E-06 | global batch size: 64 | lm loss: 7.377602E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-27 17:44:43] iteration 188/ 1000 | consumed samples: 12032 | elapsed time per iteration (ms): 74238.4 | throughput per GPU (TFLOP/s/GPU): 103.8 | learning rate: 4.686160E-06 | global batch size: 64 | lm loss: 6.513256E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure + [2024-11-27 17:46:07] iteration 189/ 1000 | consumed samples: 12096 | elapsed time per iteration (ms): 84273.9 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.682263E-06 | global batch size: 64 | lm loss: 6.903006E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure + [2024-11-27 17:47:21] iteration 190/ 1000 | consumed samples: 12160 | elapsed time per iteration (ms): 74023.6 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 4.678344E-06 | global batch size: 64 | lm loss: 6.182513E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-27 17:48:44] iteration 191/ 1000 | consumed samples: 12224 | elapsed time per iteration (ms): 82835.6 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 4.674402E-06 | global batch size: 64 | lm loss: 7.302966E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure + [2024-11-27 17:50:13] iteration 192/ 1000 | consumed samples: 12288 | elapsed time per iteration (ms): 88892.2 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.670439E-06 | global batch size: 64 | lm loss: 7.525570E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 17:51:53] iteration 193/ 1000 | consumed samples: 12352 | elapsed time per iteration (ms): 100304.3 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 4.666453E-06 | global batch size: 64 | lm loss: 7.345333E-01 | loss scale: 1.0 | grad norm: 0.944 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 17:53:19] iteration 194/ 1000 | consumed samples: 12416 | elapsed time per iteration (ms): 86240.2 | throughput per GPU (TFLOP/s/GPU): 89.4 | learning rate: 4.662444E-06 | global batch size: 64 | lm loss: 7.116587E-01 | loss scale: 1.0 | grad norm: 1.096 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 17:54:53] iteration 195/ 1000 | consumed samples: 12480 | elapsed time per iteration (ms): 93884.9 | throughput per GPU (TFLOP/s/GPU): 82.1 | learning rate: 4.658414E-06 | global batch size: 64 | lm loss: 7.222701E-01 | loss scale: 1.0 | grad norm: 0.918 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-27 17:56:14] iteration 196/ 1000 | consumed samples: 12544 | elapsed time per iteration (ms): 80589.4 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 4.654361E-06 | global batch size: 64 | lm loss: 6.828011E-01 | loss scale: 1.0 | grad norm: 1.116 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-27 17:57:31] iteration 197/ 1000 | consumed samples: 12608 | elapsed time per iteration (ms): 76912.4 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 4.650287E-06 | global batch size: 64 | lm loss: 6.875466E-01 | loss scale: 1.0 | grad norm: 1.037 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 17:58:54] iteration 198/ 1000 | consumed samples: 12672 | elapsed time per iteration (ms): 83182.9 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 4.646190E-06 | global batch size: 64 | lm loss: 7.020935E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure + [2024-11-27 18:00:11] iteration 199/ 1000 | consumed samples: 12736 | elapsed time per iteration (ms): 77283.6 | throughput per GPU (TFLOP/s/GPU): 99.7 | learning rate: 4.642072E-06 | global batch size: 64 | lm loss: 7.617804E-01 | loss scale: 1.0 | grad norm: 1.135 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-27 18:01:18] iteration 200/ 1000 | consumed samples: 12800 | elapsed time per iteration (ms): 66427.1 | throughput per GPU (TFLOP/s/GPU): 116.0 | learning rate: 4.637931E-06 | global batch size: 64 | lm loss: 6.670317E-01 | loss scale: 1.0 | grad norm: 1.136 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (209629.20, 209629.58) +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure + [2024-11-27 18:06:03] iteration 201/ 1000 | consumed samples: 12864 | elapsed time per iteration (ms): 75198.8 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 4.633769E-06 | global batch size: 64 | lm loss: 7.316583E-01 | loss scale: 1.0 | grad norm: 0.879 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure + [2024-11-27 18:07:42] iteration 202/ 1000 | consumed samples: 12928 | elapsed time per iteration (ms): 99254.2 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 4.629585E-06 | global batch size: 64 | lm loss: 6.787452E-01 | loss scale: 1.0 | grad norm: 0.797 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-27 18:09:17] iteration 203/ 1000 | consumed samples: 12992 | elapsed time per iteration (ms): 94646.6 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 4.625378E-06 | global batch size: 64 | lm loss: 7.176006E-01 | loss scale: 1.0 | grad norm: 1.155 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-27 18:10:40] iteration 204/ 1000 | consumed samples: 13056 | elapsed time per iteration (ms): 83515.3 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 4.621151E-06 | global batch size: 64 | lm loss: 7.039618E-01 | loss scale: 1.0 | grad norm: 0.840 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure + [2024-11-27 18:12:17] iteration 205/ 1000 | consumed samples: 13120 | elapsed time per iteration (ms): 96650.8 | throughput per GPU (TFLOP/s/GPU): 79.8 | learning rate: 4.616901E-06 | global batch size: 64 | lm loss: 6.275778E-01 | loss scale: 1.0 | grad norm: 1.519 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure + [2024-11-27 18:13:47] iteration 206/ 1000 | consumed samples: 13184 | elapsed time per iteration (ms): 89884.3 | throughput per GPU (TFLOP/s/GPU): 85.8 | learning rate: 4.612630E-06 | global batch size: 64 | lm loss: 6.750375E-01 | loss scale: 1.0 | grad norm: 1.124 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure + [2024-11-27 18:15:16] iteration 207/ 1000 | consumed samples: 13248 | elapsed time per iteration (ms): 88931.2 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.608337E-06 | global batch size: 64 | lm loss: 7.806692E-01 | loss scale: 1.0 | grad norm: 1.076 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:16:56] iteration 208/ 1000 | consumed samples: 13312 | elapsed time per iteration (ms): 100571.4 | throughput per GPU (TFLOP/s/GPU): 76.6 | learning rate: 4.604022E-06 | global batch size: 64 | lm loss: 6.860654E-01 | loss scale: 1.0 | grad norm: 1.029 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 18:18:18] iteration 209/ 1000 | consumed samples: 13376 | elapsed time per iteration (ms): 81771.6 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 4.599686E-06 | global batch size: 64 | lm loss: 7.601146E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure + [2024-11-27 18:19:41] iteration 210/ 1000 | consumed samples: 13440 | elapsed time per iteration (ms): 83384.6 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 4.595329E-06 | global batch size: 64 | lm loss: 7.088625E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:21:22] iteration 211/ 1000 | consumed samples: 13504 | elapsed time per iteration (ms): 100503.4 | throughput per GPU (TFLOP/s/GPU): 76.7 | learning rate: 4.590950E-06 | global batch size: 64 | lm loss: 6.751691E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-27 18:22:53] iteration 212/ 1000 | consumed samples: 13568 | elapsed time per iteration (ms): 90839.6 | throughput per GPU (TFLOP/s/GPU): 84.9 | learning rate: 4.586549E-06 | global batch size: 64 | lm loss: 7.898833E-01 | loss scale: 1.0 | grad norm: 1.115 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure + [2024-11-27 18:24:07] iteration 213/ 1000 | consumed samples: 13632 | elapsed time per iteration (ms): 74153.9 | throughput per GPU (TFLOP/s/GPU): 104.0 | learning rate: 4.582128E-06 | global batch size: 64 | lm loss: 6.275224E-01 | loss scale: 1.0 | grad norm: 2.603 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-27 18:25:25] iteration 214/ 1000 | consumed samples: 13696 | elapsed time per iteration (ms): 77891.6 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 4.577684E-06 | global batch size: 64 | lm loss: 7.842699E-01 | loss scale: 1.0 | grad norm: 1.162 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure + [2024-11-27 18:26:54] iteration 215/ 1000 | consumed samples: 13760 | elapsed time per iteration (ms): 89511.7 | throughput per GPU (TFLOP/s/GPU): 86.1 | learning rate: 4.573220E-06 | global batch size: 64 | lm loss: 7.736768E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-27 18:28:37] iteration 216/ 1000 | consumed samples: 13824 | elapsed time per iteration (ms): 102352.8 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 4.568735E-06 | global batch size: 64 | lm loss: 6.585745E-01 | loss scale: 1.0 | grad norm: 0.947 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:30:21] iteration 217/ 1000 | consumed samples: 13888 | elapsed time per iteration (ms): 103961.8 | throughput per GPU (TFLOP/s/GPU): 74.1 | learning rate: 4.564228E-06 | global batch size: 64 | lm loss: 6.783351E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-27 18:31:50] iteration 218/ 1000 | consumed samples: 13952 | elapsed time per iteration (ms): 89126.9 | throughput per GPU (TFLOP/s/GPU): 86.5 | learning rate: 4.559700E-06 | global batch size: 64 | lm loss: 6.602423E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 18:33:10] iteration 219/ 1000 | consumed samples: 14016 | elapsed time per iteration (ms): 80601.5 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 4.555151E-06 | global batch size: 64 | lm loss: 7.091999E-01 | loss scale: 1.0 | grad norm: 0.998 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:34:33] iteration 220/ 1000 | consumed samples: 14080 | elapsed time per iteration (ms): 82413.8 | throughput per GPU (TFLOP/s/GPU): 93.5 | learning rate: 4.550581E-06 | global batch size: 64 | lm loss: 6.900014E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure + [2024-11-27 18:35:58] iteration 221/ 1000 | consumed samples: 14144 | elapsed time per iteration (ms): 84945.4 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 4.545990E-06 | global batch size: 64 | lm loss: 7.463014E-01 | loss scale: 1.0 | grad norm: 0.862 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-27 18:37:37] iteration 222/ 1000 | consumed samples: 14208 | elapsed time per iteration (ms): 99477.3 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 4.541378E-06 | global batch size: 64 | lm loss: 7.625656E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure + [2024-11-27 18:39:09] iteration 223/ 1000 | consumed samples: 14272 | elapsed time per iteration (ms): 91591.3 | throughput per GPU (TFLOP/s/GPU): 84.2 | learning rate: 4.536745E-06 | global batch size: 64 | lm loss: 6.243209E-01 | loss scale: 1.0 | grad norm: 1.036 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:40:40] iteration 224/ 1000 | consumed samples: 14336 | elapsed time per iteration (ms): 90724.3 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 4.532092E-06 | global batch size: 64 | lm loss: 6.749411E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:42:01] iteration 225/ 1000 | consumed samples: 14400 | elapsed time per iteration (ms): 81320.9 | throughput per GPU (TFLOP/s/GPU): 94.8 | learning rate: 4.527417E-06 | global batch size: 64 | lm loss: 6.387181E-01 | loss scale: 1.0 | grad norm: 0.752 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 18:43:53] iteration 226/ 1000 | consumed samples: 14464 | elapsed time per iteration (ms): 111897.3 | throughput per GPU (TFLOP/s/GPU): 68.9 | learning rate: 4.522722E-06 | global batch size: 64 | lm loss: 6.825730E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-27 18:45:13] iteration 227/ 1000 | consumed samples: 14528 | elapsed time per iteration (ms): 79731.4 | throughput per GPU (TFLOP/s/GPU): 96.7 | learning rate: 4.518006E-06 | global batch size: 64 | lm loss: 6.488119E-01 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:46:50] iteration 228/ 1000 | consumed samples: 14592 | elapsed time per iteration (ms): 97086.8 | throughput per GPU (TFLOP/s/GPU): 79.4 | learning rate: 4.513270E-06 | global batch size: 64 | lm loss: 6.314636E-01 | loss scale: 1.0 | grad norm: 1.432 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 18:48:09] iteration 229/ 1000 | consumed samples: 14656 | elapsed time per iteration (ms): 79348.7 | throughput per GPU (TFLOP/s/GPU): 97.1 | learning rate: 4.508513E-06 | global batch size: 64 | lm loss: 6.365232E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 18:50:58] iteration 230/ 1000 | consumed samples: 14720 | elapsed time per iteration (ms): 168816.7 | throughput per GPU (TFLOP/s/GPU): 45.7 | learning rate: 4.503735E-06 | global batch size: 64 | lm loss: 6.728038E-01 | loss scale: 1.0 | grad norm: 1.116 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure + [2024-11-27 18:52:17] iteration 231/ 1000 | consumed samples: 14784 | elapsed time per iteration (ms): 78665.1 | throughput per GPU (TFLOP/s/GPU): 98.0 | learning rate: 4.498937E-06 | global batch size: 64 | lm loss: 6.739997E-01 | loss scale: 1.0 | grad norm: 0.953 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure + [2024-11-27 18:54:14] iteration 232/ 1000 | consumed samples: 14848 | elapsed time per iteration (ms): 117310.5 | throughput per GPU (TFLOP/s/GPU): 65.7 | learning rate: 4.494118E-06 | global batch size: 64 | lm loss: 6.592008E-01 | loss scale: 1.0 | grad norm: 1.044 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure + [2024-11-27 18:55:38] iteration 233/ 1000 | consumed samples: 14912 | elapsed time per iteration (ms): 84354.7 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 4.489279E-06 | global batch size: 64 | lm loss: 6.571340E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure + [2024-11-27 18:57:01] iteration 234/ 1000 | consumed samples: 14976 | elapsed time per iteration (ms): 82360.2 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 4.484420E-06 | global batch size: 64 | lm loss: 6.976779E-01 | loss scale: 1.0 | grad norm: 0.813 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure + [2024-11-27 18:58:50] iteration 235/ 1000 | consumed samples: 15040 | elapsed time per iteration (ms): 109720.4 | throughput per GPU (TFLOP/s/GPU): 70.3 | learning rate: 4.479540E-06 | global batch size: 64 | lm loss: 7.036162E-01 | loss scale: 1.0 | grad norm: 0.975 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 19:00:08] iteration 236/ 1000 | consumed samples: 15104 | elapsed time per iteration (ms): 77186.6 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 4.474640E-06 | global batch size: 64 | lm loss: 6.882964E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 19:01:30] iteration 237/ 1000 | consumed samples: 15168 | elapsed time per iteration (ms): 82633.5 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 4.469720E-06 | global batch size: 64 | lm loss: 6.796645E-01 | loss scale: 1.0 | grad norm: 0.973 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-27 19:02:50] iteration 238/ 1000 | consumed samples: 15232 | elapsed time per iteration (ms): 79756.3 | throughput per GPU (TFLOP/s/GPU): 96.7 | learning rate: 4.464780E-06 | global batch size: 64 | lm loss: 6.988075E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 19:04:03] iteration 239/ 1000 | consumed samples: 15296 | elapsed time per iteration (ms): 73122.9 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.459820E-06 | global batch size: 64 | lm loss: 6.563836E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 19:05:27] iteration 240/ 1000 | consumed samples: 15360 | elapsed time per iteration (ms): 83360.8 | throughput per GPU (TFLOP/s/GPU): 92.5 | learning rate: 4.454840E-06 | global batch size: 64 | lm loss: 7.006662E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure + [2024-11-27 19:06:56] iteration 241/ 1000 | consumed samples: 15424 | elapsed time per iteration (ms): 89125.8 | throughput per GPU (TFLOP/s/GPU): 86.5 | learning rate: 4.449839E-06 | global batch size: 64 | lm loss: 7.014512E-01 | loss scale: 1.0 | grad norm: 1.015 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555def77b080] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] Missing reference picture, default is 65542 +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] Missing reference picture, default is 65542 +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 19:08:15] iteration 242/ 1000 | consumed samples: 15488 | elapsed time per iteration (ms): 79642.5 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 4.444819E-06 | global batch size: 64 | lm loss: 6.564631E-01 | loss scale: 1.0 | grad norm: 0.756 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure + [2024-11-27 19:09:39] iteration 243/ 1000 | consumed samples: 15552 | elapsed time per iteration (ms): 83777.7 | throughput per GPU (TFLOP/s/GPU): 92.0 | learning rate: 4.439779E-06 | global batch size: 64 | lm loss: 7.187881E-01 | loss scale: 1.0 | grad norm: 1.296 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure + [2024-11-27 19:10:56] iteration 244/ 1000 | consumed samples: 15616 | elapsed time per iteration (ms): 76634.3 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 4.434719E-06 | global batch size: 64 | lm loss: 6.608657E-01 | loss scale: 1.0 | grad norm: 0.865 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure + [2024-11-27 19:12:14] iteration 245/ 1000 | consumed samples: 15680 | elapsed time per iteration (ms): 77894.3 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 4.429639E-06 | global batch size: 64 | lm loss: 6.903512E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure + [2024-11-27 19:13:50] iteration 246/ 1000 | consumed samples: 15744 | elapsed time per iteration (ms): 96295.0 | throughput per GPU (TFLOP/s/GPU): 80.1 | learning rate: 4.424540E-06 | global batch size: 64 | lm loss: 6.512361E-01 | loss scale: 1.0 | grad norm: 0.934 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure + [2024-11-27 19:14:55] iteration 247/ 1000 | consumed samples: 15808 | elapsed time per iteration (ms): 64734.1 | throughput per GPU (TFLOP/s/GPU): 119.1 | learning rate: 4.419421E-06 | global batch size: 64 | lm loss: 6.166883E-01 | loss scale: 1.0 | grad norm: 0.891 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-27 19:16:25] iteration 248/ 1000 | consumed samples: 15872 | elapsed time per iteration (ms): 90164.6 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 4.414282E-06 | global batch size: 64 | lm loss: 6.597496E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 19:18:02] iteration 249/ 1000 | consumed samples: 15936 | elapsed time per iteration (ms): 96840.0 | throughput per GPU (TFLOP/s/GPU): 79.6 | learning rate: 4.409124E-06 | global batch size: 64 | lm loss: 6.686405E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure + [2024-11-27 19:19:17] iteration 250/ 1000 | consumed samples: 16000 | elapsed time per iteration (ms): 74766.9 | throughput per GPU (TFLOP/s/GPU): 103.1 | learning rate: 4.403946E-06 | global batch size: 64 | lm loss: 6.316459E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure + [2024-11-27 19:20:51] iteration 251/ 1000 | consumed samples: 16064 | elapsed time per iteration (ms): 94801.2 | throughput per GPU (TFLOP/s/GPU): 81.3 | learning rate: 4.398749E-06 | global batch size: 64 | lm loss: 6.754164E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 19:22:08] iteration 252/ 1000 | consumed samples: 16128 | elapsed time per iteration (ms): 76512.8 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 4.393533E-06 | global batch size: 64 | lm loss: 7.113305E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 19:23:32] iteration 253/ 1000 | consumed samples: 16192 | elapsed time per iteration (ms): 84219.0 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.388297E-06 | global batch size: 64 | lm loss: 7.143953E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 19:24:58] iteration 254/ 1000 | consumed samples: 16256 | elapsed time per iteration (ms): 85796.8 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 4.383042E-06 | global batch size: 64 | lm loss: 6.647977E-01 | loss scale: 1.0 | grad norm: 0.926 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-27 19:26:32] iteration 255/ 1000 | consumed samples: 16320 | elapsed time per iteration (ms): 94433.6 | throughput per GPU (TFLOP/s/GPU): 81.6 | learning rate: 4.377767E-06 | global batch size: 64 | lm loss: 7.061955E-01 | loss scale: 1.0 | grad norm: 1.360 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 19:28:04] iteration 256/ 1000 | consumed samples: 16384 | elapsed time per iteration (ms): 91966.5 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 4.372474E-06 | global batch size: 64 | lm loss: 6.892250E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 19:29:28] iteration 257/ 1000 | consumed samples: 16448 | elapsed time per iteration (ms): 83262.5 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 4.367161E-06 | global batch size: 64 | lm loss: 6.549360E-01 | loss scale: 1.0 | grad norm: 0.877 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-27 19:30:52] iteration 258/ 1000 | consumed samples: 16512 | elapsed time per iteration (ms): 84090.0 | throughput per GPU (TFLOP/s/GPU): 91.7 | learning rate: 4.361829E-06 | global batch size: 64 | lm loss: 6.244700E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 19:32:18] iteration 259/ 1000 | consumed samples: 16576 | elapsed time per iteration (ms): 85835.7 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 4.356478E-06 | global batch size: 64 | lm loss: 7.172493E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 19:33:50] iteration 260/ 1000 | consumed samples: 16640 | elapsed time per iteration (ms): 92333.1 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 4.351109E-06 | global batch size: 64 | lm loss: 6.703358E-01 | loss scale: 1.0 | grad norm: 1.093 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-27 19:35:19] iteration 261/ 1000 | consumed samples: 16704 | elapsed time per iteration (ms): 89282.0 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 4.345720E-06 | global batch size: 64 | lm loss: 6.450242E-01 | loss scale: 1.0 | grad norm: 1.135 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure + [2024-11-27 19:36:41] iteration 262/ 1000 | consumed samples: 16768 | elapsed time per iteration (ms): 81865.5 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.340313E-06 | global batch size: 64 | lm loss: 6.914611E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-27 19:38:21] iteration 263/ 1000 | consumed samples: 16832 | elapsed time per iteration (ms): 100170.0 | throughput per GPU (TFLOP/s/GPU): 77.0 | learning rate: 4.334886E-06 | global batch size: 64 | lm loss: 6.484630E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-27 19:39:52] iteration 264/ 1000 | consumed samples: 16896 | elapsed time per iteration (ms): 90530.8 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.329441E-06 | global batch size: 64 | lm loss: 6.836088E-01 | loss scale: 1.0 | grad norm: 0.978 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 19:42:18] iteration 265/ 1000 | consumed samples: 16960 | elapsed time per iteration (ms): 146045.8 | throughput per GPU (TFLOP/s/GPU): 52.8 | learning rate: 4.323978E-06 | global batch size: 64 | lm loss: 6.735452E-01 | loss scale: 1.0 | grad norm: 0.893 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-27 19:43:34] iteration 266/ 1000 | consumed samples: 17024 | elapsed time per iteration (ms): 75962.2 | throughput per GPU (TFLOP/s/GPU): 101.5 | learning rate: 4.318496E-06 | global batch size: 64 | lm loss: 7.168859E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-27 19:44:52] iteration 267/ 1000 | consumed samples: 17088 | elapsed time per iteration (ms): 77862.5 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 4.312995E-06 | global batch size: 64 | lm loss: 7.042320E-01 | loss scale: 1.0 | grad norm: 0.832 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 19:46:26] iteration 268/ 1000 | consumed samples: 17152 | elapsed time per iteration (ms): 94079.1 | throughput per GPU (TFLOP/s/GPU): 81.9 | learning rate: 4.307476E-06 | global batch size: 64 | lm loss: 7.036880E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-27 19:47:47] iteration 269/ 1000 | consumed samples: 17216 | elapsed time per iteration (ms): 80930.5 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 4.301938E-06 | global batch size: 64 | lm loss: 8.056663E-01 | loss scale: 1.0 | grad norm: 1.143 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-27 19:49:17] iteration 270/ 1000 | consumed samples: 17280 | elapsed time per iteration (ms): 90603.4 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.296382E-06 | global batch size: 64 | lm loss: 6.936717E-01 | loss scale: 1.0 | grad norm: 1.050 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 19:52:28] iteration 271/ 1000 | consumed samples: 17344 | elapsed time per iteration (ms): 190195.6 | throughput per GPU (TFLOP/s/GPU): 40.5 | learning rate: 4.290807E-06 | global batch size: 64 | lm loss: 7.112800E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 19:54:11] iteration 272/ 1000 | consumed samples: 17408 | elapsed time per iteration (ms): 103571.7 | throughput per GPU (TFLOP/s/GPU): 74.4 | learning rate: 4.285215E-06 | global batch size: 64 | lm loss: 7.187757E-01 | loss scale: 1.0 | grad norm: 0.988 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-27 19:55:39] iteration 273/ 1000 | consumed samples: 17472 | elapsed time per iteration (ms): 88072.4 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 4.279604E-06 | global batch size: 64 | lm loss: 7.316838E-01 | loss scale: 1.0 | grad norm: 1.097 | number of skipped iterations: 0 | number of nan iterations: 0 | +processed_samples 1000 unjoint_samples 1000 joint_samples 71 [34696, 112979] +processed_samples 1000 unjoint_samples 1000 joint_samples 71 [34696, 112979] +processed_samples 1000 unjoint_samples 1000 joint_samples 71 [97008, 105349] +processed_samples 1000 unjoint_samples 1000 joint_samples 71 [97008, 105349] +processed_samples 1000 unjoint_samples 1000 joint_samples 75 [56126, 109719] +processed_samples 1000 unjoint_samples 1000 joint_samples 75 [56126, 109719] +processed_samples 1000 unjoint_samples 1000 joint_samples 64 [123627, 39784] +processed_samples 1000 unjoint_samples 1000 joint_samples 64 [123627, 39784] +processed_samples 1000 unjoint_samples 1000 joint_samples 69 [122492, 103007] +processed_samples 1000 unjoint_samples 1000 joint_samples 69 [122492, 103007] +processed_samples 1000 unjoint_samples 1000 joint_samples 66 [100663, 105124] +processed_samples 1000 unjoint_samples 1000 joint_samples 66 [100663, 105124] +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +processed_samples 1000 unjoint_samples 1000 joint_samples 63 [87351, 127977] +processed_samples 1000 unjoint_samples 1000 joint_samples 63 [87351, 127977] +processed_samples 1000 unjoint_samples 1000 joint_samples 68 [127446, 15741] +processed_samples 1000 unjoint_samples 1000 joint_samples 68 [127446, 15741] +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d957c6e140] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957c6e140] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-27 19:57:09] iteration 274/ 1000 | consumed samples: 17536 | elapsed time per iteration (ms): 90003.2 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 4.273975E-06 | global batch size: 64 | lm loss: 6.670929E-01 | loss scale: 1.0 | grad norm: 1.001 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 19:58:29] iteration 275/ 1000 | consumed samples: 17600 | elapsed time per iteration (ms): 79210.1 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 4.268328E-06 | global batch size: 64 | lm loss: 6.567287E-01 | loss scale: 1.0 | grad norm: 1.149 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 20:00:53] iteration 276/ 1000 | consumed samples: 17664 | elapsed time per iteration (ms): 144170.2 | throughput per GPU (TFLOP/s/GPU): 53.5 | learning rate: 4.262663E-06 | global batch size: 64 | lm loss: 6.671741E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-27 20:02:33] iteration 277/ 1000 | consumed samples: 17728 | elapsed time per iteration (ms): 100716.0 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 4.256980E-06 | global batch size: 64 | lm loss: 6.349506E-01 | loss scale: 1.0 | grad norm: 1.063 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 20:04:16] iteration 278/ 1000 | consumed samples: 17792 | elapsed time per iteration (ms): 102385.9 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 4.251279E-06 | global batch size: 64 | lm loss: 6.847430E-01 | loss scale: 1.0 | grad norm: 2.834 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 20:05:50] iteration 279/ 1000 | consumed samples: 17856 | elapsed time per iteration (ms): 93700.5 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 4.245560E-06 | global batch size: 64 | lm loss: 6.650225E-01 | loss scale: 1.0 | grad norm: 0.928 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 20:07:22] iteration 280/ 1000 | consumed samples: 17920 | elapsed time per iteration (ms): 92883.3 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 4.239823E-06 | global batch size: 64 | lm loss: 7.579253E-01 | loss scale: 1.0 | grad norm: 1.494 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-27 20:08:44] iteration 281/ 1000 | consumed samples: 17984 | elapsed time per iteration (ms): 81491.8 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 4.234069E-06 | global batch size: 64 | lm loss: 6.707730E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 20:10:12] iteration 282/ 1000 | consumed samples: 18048 | elapsed time per iteration (ms): 87592.8 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 4.228297E-06 | global batch size: 64 | lm loss: 7.038647E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-27 20:11:33] iteration 283/ 1000 | consumed samples: 18112 | elapsed time per iteration (ms): 81693.7 | throughput per GPU (TFLOP/s/GPU): 94.4 | learning rate: 4.222507E-06 | global batch size: 64 | lm loss: 7.013253E-01 | loss scale: 1.0 | grad norm: 1.013 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 20:12:53] iteration 284/ 1000 | consumed samples: 18176 | elapsed time per iteration (ms): 79464.8 | throughput per GPU (TFLOP/s/GPU): 97.0 | learning rate: 4.216700E-06 | global batch size: 64 | lm loss: 7.138371E-01 | loss scale: 1.0 | grad norm: 0.863 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 20:14:34] iteration 285/ 1000 | consumed samples: 18240 | elapsed time per iteration (ms): 100817.1 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 4.210876E-06 | global batch size: 64 | lm loss: 6.813183E-01 | loss scale: 1.0 | grad norm: 0.847 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 20:15:48] iteration 286/ 1000 | consumed samples: 18304 | elapsed time per iteration (ms): 74261.4 | throughput per GPU (TFLOP/s/GPU): 103.8 | learning rate: 4.205033E-06 | global batch size: 64 | lm loss: 6.868650E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure + [2024-11-27 20:17:10] iteration 287/ 1000 | consumed samples: 18368 | elapsed time per iteration (ms): 81979.8 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 4.199174E-06 | global batch size: 64 | lm loss: 6.615226E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 20:18:43] iteration 288/ 1000 | consumed samples: 18432 | elapsed time per iteration (ms): 92724.0 | throughput per GPU (TFLOP/s/GPU): 83.1 | learning rate: 4.193297E-06 | global batch size: 64 | lm loss: 6.780793E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 20:20:12] iteration 289/ 1000 | consumed samples: 18496 | elapsed time per iteration (ms): 89460.1 | throughput per GPU (TFLOP/s/GPU): 86.2 | learning rate: 4.187403E-06 | global batch size: 64 | lm loss: 6.989150E-01 | loss scale: 1.0 | grad norm: 0.779 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 20:21:52] iteration 290/ 1000 | consumed samples: 18560 | elapsed time per iteration (ms): 99500.0 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 4.181492E-06 | global batch size: 64 | lm loss: 6.454962E-01 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-27 20:23:08] iteration 291/ 1000 | consumed samples: 18624 | elapsed time per iteration (ms): 76484.1 | throughput per GPU (TFLOP/s/GPU): 100.8 | learning rate: 4.175564E-06 | global batch size: 64 | lm loss: 6.725484E-01 | loss scale: 1.0 | grad norm: 0.911 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure + [2024-11-27 20:24:16] iteration 292/ 1000 | consumed samples: 18688 | elapsed time per iteration (ms): 68308.3 | throughput per GPU (TFLOP/s/GPU): 112.8 | learning rate: 4.169619E-06 | global batch size: 64 | lm loss: 7.150757E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-27 20:25:37] iteration 293/ 1000 | consumed samples: 18752 | elapsed time per iteration (ms): 80063.3 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 4.163656E-06 | global batch size: 64 | lm loss: 6.578319E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-27 20:27:17] iteration 294/ 1000 | consumed samples: 18816 | elapsed time per iteration (ms): 100255.9 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 4.157677E-06 | global batch size: 64 | lm loss: 6.832387E-01 | loss scale: 1.0 | grad norm: 0.984 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-27 20:31:05] iteration 295/ 1000 | consumed samples: 18880 | elapsed time per iteration (ms): 228503.4 | throughput per GPU (TFLOP/s/GPU): 33.7 | learning rate: 4.151681E-06 | global batch size: 64 | lm loss: 6.679215E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-27 20:32:22] iteration 296/ 1000 | consumed samples: 18944 | elapsed time per iteration (ms): 76937.8 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 4.145668E-06 | global batch size: 64 | lm loss: 6.361890E-01 | loss scale: 1.0 | grad norm: 1.200 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 20:33:54] iteration 297/ 1000 | consumed samples: 19008 | elapsed time per iteration (ms): 92103.0 | throughput per GPU (TFLOP/s/GPU): 83.7 | learning rate: 4.139639E-06 | global batch size: 64 | lm loss: 7.190810E-01 | loss scale: 1.0 | grad norm: 1.149 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure + [2024-11-27 20:37:06] iteration 298/ 1000 | consumed samples: 19072 | elapsed time per iteration (ms): 191713.3 | throughput per GPU (TFLOP/s/GPU): 40.2 | learning rate: 4.133592E-06 | global batch size: 64 | lm loss: 6.651546E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 20:38:11] iteration 299/ 1000 | consumed samples: 19136 | elapsed time per iteration (ms): 64999.4 | throughput per GPU (TFLOP/s/GPU): 118.6 | learning rate: 4.127530E-06 | global batch size: 64 | lm loss: 6.572868E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure + [2024-11-27 20:39:31] iteration 300/ 1000 | consumed samples: 19200 | elapsed time per iteration (ms): 79891.6 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 4.121450E-06 | global batch size: 64 | lm loss: 7.114632E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (206621.70, 206622.09) + [2024-11-27 20:44:44] iteration 301/ 1000 | consumed samples: 19264 | elapsed time per iteration (ms): 106072.5 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 4.115354E-06 | global batch size: 64 | lm loss: 7.603645E-01 | loss scale: 1.0 | grad norm: 0.897 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 20:46:07] iteration 302/ 1000 | consumed samples: 19328 | elapsed time per iteration (ms): 83038.7 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 4.109242E-06 | global batch size: 64 | lm loss: 6.639850E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95613c240] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure + [2024-11-27 20:47:19] iteration 303/ 1000 | consumed samples: 19392 | elapsed time per iteration (ms): 72130.6 | throughput per GPU (TFLOP/s/GPU): 106.9 | learning rate: 4.103113E-06 | global batch size: 64 | lm loss: 6.722466E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure + [2024-11-27 20:49:15] iteration 304/ 1000 | consumed samples: 19456 | elapsed time per iteration (ms): 116054.6 | throughput per GPU (TFLOP/s/GPU): 66.4 | learning rate: 4.096968E-06 | global batch size: 64 | lm loss: 6.590104E-01 | loss scale: 1.0 | grad norm: 0.740 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure + [2024-11-27 20:51:10] iteration 305/ 1000 | consumed samples: 19520 | elapsed time per iteration (ms): 114899.9 | throughput per GPU (TFLOP/s/GPU): 67.1 | learning rate: 4.090807E-06 | global batch size: 64 | lm loss: 6.442765E-01 | loss scale: 1.0 | grad norm: 1.196 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 20:52:19] iteration 306/ 1000 | consumed samples: 19584 | elapsed time per iteration (ms): 69105.8 | throughput per GPU (TFLOP/s/GPU): 111.5 | learning rate: 4.084630E-06 | global batch size: 64 | lm loss: 6.731645E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure + [2024-11-27 20:53:37] iteration 307/ 1000 | consumed samples: 19648 | elapsed time per iteration (ms): 78406.8 | throughput per GPU (TFLOP/s/GPU): 98.3 | learning rate: 4.078436E-06 | global batch size: 64 | lm loss: 6.682714E-01 | loss scale: 1.0 | grad norm: 0.812 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 20:55:17] iteration 308/ 1000 | consumed samples: 19712 | elapsed time per iteration (ms): 99738.4 | throughput per GPU (TFLOP/s/GPU): 77.3 | learning rate: 4.072227E-06 | global batch size: 64 | lm loss: 6.655051E-01 | loss scale: 1.0 | grad norm: 0.865 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 20:56:54] iteration 309/ 1000 | consumed samples: 19776 | elapsed time per iteration (ms): 96419.7 | throughput per GPU (TFLOP/s/GPU): 79.9 | learning rate: 4.066001E-06 | global batch size: 64 | lm loss: 7.021908E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure + [2024-11-27 20:58:10] iteration 310/ 1000 | consumed samples: 19840 | elapsed time per iteration (ms): 75939.3 | throughput per GPU (TFLOP/s/GPU): 101.5 | learning rate: 4.059760E-06 | global batch size: 64 | lm loss: 6.953746E-01 | loss scale: 1.0 | grad norm: 0.884 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 20:59:21] iteration 311/ 1000 | consumed samples: 19904 | elapsed time per iteration (ms): 71031.9 | throughput per GPU (TFLOP/s/GPU): 108.5 | learning rate: 4.053503E-06 | global batch size: 64 | lm loss: 6.717713E-01 | loss scale: 1.0 | grad norm: 0.991 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure + [2024-11-27 21:00:32] iteration 312/ 1000 | consumed samples: 19968 | elapsed time per iteration (ms): 71770.0 | throughput per GPU (TFLOP/s/GPU): 107.4 | learning rate: 4.047230E-06 | global batch size: 64 | lm loss: 6.943258E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 21:02:08] iteration 313/ 1000 | consumed samples: 20032 | elapsed time per iteration (ms): 95766.8 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 4.040941E-06 | global batch size: 64 | lm loss: 6.254554E-01 | loss scale: 1.0 | grad norm: 0.882 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 21:03:29] iteration 314/ 1000 | consumed samples: 20096 | elapsed time per iteration (ms): 80604.1 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 4.034637E-06 | global batch size: 64 | lm loss: 6.780632E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-27 21:04:49] iteration 315/ 1000 | consumed samples: 20160 | elapsed time per iteration (ms): 80301.8 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.028317E-06 | global batch size: 64 | lm loss: 6.939998E-01 | loss scale: 1.0 | grad norm: 1.131 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee7c5a00] mmco: unref short failure +[h264 @ 0x555dee7c5a00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee7c5a00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee7c5a00] mmco: unref short failure +[h264 @ 0x555dee7c5a00] mmco: unref short failure + [2024-11-27 21:06:33] iteration 316/ 1000 | consumed samples: 20224 | elapsed time per iteration (ms): 103987.9 | throughput per GPU (TFLOP/s/GPU): 74.1 | learning rate: 4.021981E-06 | global batch size: 64 | lm loss: 6.664628E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec8de840] mmco: unref short failure +[h264 @ 0x555dec8de840] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dec8de840] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dec8de840] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dec8de840] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dec8de840] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 21:08:17] iteration 317/ 1000 | consumed samples: 20288 | elapsed time per iteration (ms): 103536.3 | throughput per GPU (TFLOP/s/GPU): 74.5 | learning rate: 4.015630E-06 | global batch size: 64 | lm loss: 6.802962E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure + [2024-11-27 21:09:56] iteration 318/ 1000 | consumed samples: 20352 | elapsed time per iteration (ms): 98927.0 | throughput per GPU (TFLOP/s/GPU): 77.9 | learning rate: 4.009264E-06 | global batch size: 64 | lm loss: 6.827909E-01 | loss scale: 1.0 | grad norm: 1.091 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-27 21:11:27] iteration 319/ 1000 | consumed samples: 20416 | elapsed time per iteration (ms): 91131.2 | throughput per GPU (TFLOP/s/GPU): 84.6 | learning rate: 4.002883E-06 | global batch size: 64 | lm loss: 7.089035E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-27 21:12:55] iteration 320/ 1000 | consumed samples: 20480 | elapsed time per iteration (ms): 87896.1 | throughput per GPU (TFLOP/s/GPU): 87.7 | learning rate: 3.996486E-06 | global batch size: 64 | lm loss: 7.206851E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] [h264 @ 0x555dee6ec240] mmco: unref short failure +mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure + [2024-11-27 21:14:09] iteration 321/ 1000 | consumed samples: 20544 | elapsed time per iteration (ms): 74712.4 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 3.990074E-06 | global batch size: 64 | lm loss: 6.479557E-01 | loss scale: 1.0 | grad norm: 1.319 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure + [2024-11-27 21:15:33] iteration 322/ 1000 | consumed samples: 20608 | elapsed time per iteration (ms): 83255.3 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 3.983647E-06 | global batch size: 64 | lm loss: 6.448869E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-27 21:16:50] iteration 323/ 1000 | consumed samples: 20672 | elapsed time per iteration (ms): 77458.4 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 3.977205E-06 | global batch size: 64 | lm loss: 7.210954E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dee6ec240] Missing reference picture, default is 65530 +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955acf300] Missing reference picture, default is 65530 +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-27 21:18:32] iteration 324/ 1000 | consumed samples: 20736 | elapsed time per iteration (ms): 102000.0 | throughput per GPU (TFLOP/s/GPU): 75.6 | learning rate: 3.970748E-06 | global batch size: 64 | lm loss: 6.985208E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure + [2024-11-27 21:19:45] iteration 325/ 1000 | consumed samples: 20800 | elapsed time per iteration (ms): 73014.5 | throughput per GPU (TFLOP/s/GPU): 105.6 | learning rate: 3.964276E-06 | global batch size: 64 | lm loss: 7.221389E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 21:21:08] iteration 326/ 1000 | consumed samples: 20864 | elapsed time per iteration (ms): 82786.4 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 3.957789E-06 | global batch size: 64 | lm loss: 7.336783E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-27 21:22:48] iteration 327/ 1000 | consumed samples: 20928 | elapsed time per iteration (ms): 99754.9 | throughput per GPU (TFLOP/s/GPU): 77.3 | learning rate: 3.951287E-06 | global batch size: 64 | lm loss: 7.123046E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-27 21:24:02] iteration 328/ 1000 | consumed samples: 20992 | elapsed time per iteration (ms): 74610.5 | throughput per GPU (TFLOP/s/GPU): 103.3 | learning rate: 3.944771E-06 | global batch size: 64 | lm loss: 7.396567E-01 | loss scale: 1.0 | grad norm: 1.181 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure + [2024-11-27 21:25:33] iteration 329/ 1000 | consumed samples: 21056 | elapsed time per iteration (ms): 90607.2 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 3.938240E-06 | global batch size: 64 | lm loss: 6.970116E-01 | loss scale: 1.0 | grad norm: 1.010 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure + [2024-11-27 21:26:51] iteration 330/ 1000 | consumed samples: 21120 | elapsed time per iteration (ms): 77901.1 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 3.931695E-06 | global batch size: 64 | lm loss: 6.198275E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-27 21:27:59] iteration 331/ 1000 | consumed samples: 21184 | elapsed time per iteration (ms): 67923.3 | throughput per GPU (TFLOP/s/GPU): 113.5 | learning rate: 3.925135E-06 | global batch size: 64 | lm loss: 7.459432E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure + [2024-11-27 21:30:51] iteration 332/ 1000 | consumed samples: 21248 | elapsed time per iteration (ms): 172399.5 | throughput per GPU (TFLOP/s/GPU): 44.7 | learning rate: 3.918560E-06 | global batch size: 64 | lm loss: 6.864884E-01 | loss scale: 1.0 | grad norm: 1.143 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure + [2024-11-27 21:32:13] iteration 333/ 1000 | consumed samples: 21312 | elapsed time per iteration (ms): 81667.0 | throughput per GPU (TFLOP/s/GPU): 94.4 | learning rate: 3.911972E-06 | global batch size: 64 | lm loss: 6.252856E-01 | loss scale: 1.0 | grad norm: 1.149 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure + [2024-11-27 21:33:50] iteration 334/ 1000 | consumed samples: 21376 | elapsed time per iteration (ms): 97273.8 | throughput per GPU (TFLOP/s/GPU): 79.2 | learning rate: 3.905369E-06 | global batch size: 64 | lm loss: 7.187002E-01 | loss scale: 1.0 | grad norm: 1.079 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-27 21:35:21] iteration 335/ 1000 | consumed samples: 21440 | elapsed time per iteration (ms): 91328.7 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 3.898751E-06 | global batch size: 64 | lm loss: 6.292923E-01 | loss scale: 1.0 | grad norm: 0.950 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure + [2024-11-27 21:36:40] iteration 336/ 1000 | consumed samples: 21504 | elapsed time per iteration (ms): 78764.0 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 3.892120E-06 | global batch size: 64 | lm loss: 6.587684E-01 | loss scale: 1.0 | grad norm: 1.085 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-27 21:38:02] iteration 337/ 1000 | consumed samples: 21568 | elapsed time per iteration (ms): 81405.1 | throughput per GPU (TFLOP/s/GPU): 94.7 | learning rate: 3.885475E-06 | global batch size: 64 | lm loss: 6.140105E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 21:39:21] iteration 338/ 1000 | consumed samples: 21632 | elapsed time per iteration (ms): 79667.3 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 3.878815E-06 | global batch size: 64 | lm loss: 6.848626E-01 | loss scale: 1.0 | grad norm: 0.956 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-27 21:40:47] iteration 339/ 1000 | consumed samples: 21696 | elapsed time per iteration (ms): 85169.7 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 3.872142E-06 | global batch size: 64 | lm loss: 7.561729E-01 | loss scale: 1.0 | grad norm: 0.948 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure + [2024-11-27 21:42:38] iteration 340/ 1000 | consumed samples: 21760 | elapsed time per iteration (ms): 111085.9 | throughput per GPU (TFLOP/s/GPU): 69.4 | learning rate: 3.865454E-06 | global batch size: 64 | lm loss: 7.680401E-01 | loss scale: 1.0 | grad norm: 0.944 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 21:45:50] iteration 341/ 1000 | consumed samples: 21824 | elapsed time per iteration (ms): 191908.5 | throughput per GPU (TFLOP/s/GPU): 40.2 | learning rate: 3.858753E-06 | global batch size: 64 | lm loss: 6.925346E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-27 21:47:07] iteration 342/ 1000 | consumed samples: 21888 | elapsed time per iteration (ms): 77534.8 | throughput per GPU (TFLOP/s/GPU): 99.4 | learning rate: 3.852039E-06 | global batch size: 64 | lm loss: 7.017056E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure + [2024-11-27 21:48:40] iteration 343/ 1000 | consumed samples: 21952 | elapsed time per iteration (ms): 92606.0 | throughput per GPU (TFLOP/s/GPU): 83.2 | learning rate: 3.845310E-06 | global batch size: 64 | lm loss: 6.684653E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure + [2024-11-27 21:50:24] iteration 344/ 1000 | consumed samples: 22016 | elapsed time per iteration (ms): 104328.5 | throughput per GPU (TFLOP/s/GPU): 73.9 | learning rate: 3.838568E-06 | global batch size: 64 | lm loss: 6.584975E-01 | loss scale: 1.0 | grad norm: 0.806 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 21:51:43] iteration 345/ 1000 | consumed samples: 22080 | elapsed time per iteration (ms): 78557.2 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 3.831812E-06 | global batch size: 64 | lm loss: 7.187029E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-27 21:52:58] iteration 346/ 1000 | consumed samples: 22144 | elapsed time per iteration (ms): 75116.0 | throughput per GPU (TFLOP/s/GPU): 102.6 | learning rate: 3.825043E-06 | global batch size: 64 | lm loss: 6.967052E-01 | loss scale: 1.0 | grad norm: 1.003 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure + [2024-11-27 21:54:41] iteration 347/ 1000 | consumed samples: 22208 | elapsed time per iteration (ms): 103431.0 | throughput per GPU (TFLOP/s/GPU): 74.5 | learning rate: 3.818261E-06 | global batch size: 64 | lm loss: 7.444284E-01 | loss scale: 1.0 | grad norm: 1.182 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-27 21:56:19] iteration 348/ 1000 | consumed samples: 22272 | elapsed time per iteration (ms): 97685.4 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 3.811465E-06 | global batch size: 64 | lm loss: 6.478388E-01 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 21:57:59] iteration 349/ 1000 | consumed samples: 22336 | elapsed time per iteration (ms): 100372.1 | throughput per GPU (TFLOP/s/GPU): 76.8 | learning rate: 3.804656E-06 | global batch size: 64 | lm loss: 6.880268E-01 | loss scale: 1.0 | grad norm: 0.730 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-27 22:00:34] iteration 350/ 1000 | consumed samples: 22400 | elapsed time per iteration (ms): 154667.2 | throughput per GPU (TFLOP/s/GPU): 49.8 | learning rate: 3.797834E-06 | global batch size: 64 | lm loss: 7.512761E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure + [2024-11-27 22:02:09] iteration 351/ 1000 | consumed samples: 22464 | elapsed time per iteration (ms): 95160.2 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 3.790999E-06 | global batch size: 64 | lm loss: 7.407740E-01 | loss scale: 1.0 | grad norm: 0.972 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure + [2024-11-27 22:03:16] iteration 352/ 1000 | consumed samples: 22528 | elapsed time per iteration (ms): 66347.4 | throughput per GPU (TFLOP/s/GPU): 116.2 | learning rate: 3.784151E-06 | global batch size: 64 | lm loss: 6.471483E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-27 22:05:04] iteration 353/ 1000 | consumed samples: 22592 | elapsed time per iteration (ms): 108513.0 | throughput per GPU (TFLOP/s/GPU): 71.0 | learning rate: 3.777290E-06 | global batch size: 64 | lm loss: 6.076833E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-27 22:06:37] iteration 354/ 1000 | consumed samples: 22656 | elapsed time per iteration (ms): 92546.4 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 3.770416E-06 | global batch size: 64 | lm loss: 6.827998E-01 | loss scale: 1.0 | grad norm: 1.299 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure + [2024-11-27 22:08:07] iteration 355/ 1000 | consumed samples: 22720 | elapsed time per iteration (ms): 90414.0 | throughput per GPU (TFLOP/s/GPU): 85.3 | learning rate: 3.763529E-06 | global batch size: 64 | lm loss: 6.579286E-01 | loss scale: 1.0 | grad norm: 1.292 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-27 22:09:37] iteration 356/ 1000 | consumed samples: 22784 | elapsed time per iteration (ms): 89877.2 | throughput per GPU (TFLOP/s/GPU): 85.8 | learning rate: 3.756630E-06 | global batch size: 64 | lm loss: 7.134993E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-27 22:11:35] iteration 357/ 1000 | consumed samples: 22848 | elapsed time per iteration (ms): 118027.1 | throughput per GPU (TFLOP/s/GPU): 65.3 | learning rate: 3.749717E-06 | global batch size: 64 | lm loss: 7.717846E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure + [2024-11-27 22:13:00] iteration 358/ 1000 | consumed samples: 22912 | elapsed time per iteration (ms): 85382.6 | throughput per GPU (TFLOP/s/GPU): 90.3 | learning rate: 3.742793E-06 | global batch size: 64 | lm loss: 6.735170E-01 | loss scale: 1.0 | grad norm: 0.716 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 22:14:27] iteration 359/ 1000 | consumed samples: 22976 | elapsed time per iteration (ms): 86401.7 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.735855E-06 | global batch size: 64 | lm loss: 6.633616E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-27 22:17:18] iteration 360/ 1000 | consumed samples: 23040 | elapsed time per iteration (ms): 170996.0 | throughput per GPU (TFLOP/s/GPU): 45.1 | learning rate: 3.728906E-06 | global batch size: 64 | lm loss: 6.863374E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure + [2024-11-27 22:18:46] iteration 361/ 1000 | consumed samples: 23104 | elapsed time per iteration (ms): 88048.3 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 3.721943E-06 | global batch size: 64 | lm loss: 7.136717E-01 | loss scale: 1.0 | grad norm: 1.186 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 22:20:03] iteration 362/ 1000 | consumed samples: 23168 | elapsed time per iteration (ms): 76992.2 | throughput per GPU (TFLOP/s/GPU): 100.1 | learning rate: 3.714969E-06 | global batch size: 64 | lm loss: 6.679822E-01 | loss scale: 1.0 | grad norm: 0.699 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 22:21:44] iteration 363/ 1000 | consumed samples: 23232 | elapsed time per iteration (ms): 101637.7 | throughput per GPU (TFLOP/s/GPU): 75.8 | learning rate: 3.707982E-06 | global batch size: 64 | lm loss: 6.850898E-01 | loss scale: 1.0 | grad norm: 1.204 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 22:23:38] iteration 364/ 1000 | consumed samples: 23296 | elapsed time per iteration (ms): 113918.9 | throughput per GPU (TFLOP/s/GPU): 67.7 | learning rate: 3.700984E-06 | global batch size: 64 | lm loss: 6.541436E-01 | loss scale: 1.0 | grad norm: 1.360 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df368a780] mmco: unref short failure +[h264 @ 0x555df368a780] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 22:25:22] iteration 365/ 1000 | consumed samples: 23360 | elapsed time per iteration (ms): 103746.9 | throughput per GPU (TFLOP/s/GPU): 74.3 | learning rate: 3.693973E-06 | global batch size: 64 | lm loss: 6.338899E-01 | loss scale: 1.0 | grad norm: 0.910 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure + [2024-11-27 22:26:38] iteration 366/ 1000 | consumed samples: 23424 | elapsed time per iteration (ms): 75846.3 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 3.686950E-06 | global batch size: 64 | lm loss: 6.501561E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure + [2024-11-27 22:27:59] iteration 367/ 1000 | consumed samples: 23488 | elapsed time per iteration (ms): 80842.6 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 3.679915E-06 | global batch size: 64 | lm loss: 6.532076E-01 | loss scale: 1.0 | grad norm: 0.893 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-27 22:29:14] iteration 368/ 1000 | consumed samples: 23552 | elapsed time per iteration (ms): 75294.2 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 3.672869E-06 | global batch size: 64 | lm loss: 7.082227E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 22:31:07] iteration 369/ 1000 | consumed samples: 23616 | elapsed time per iteration (ms): 113118.2 | throughput per GPU (TFLOP/s/GPU): 68.1 | learning rate: 3.665810E-06 | global batch size: 64 | lm loss: 6.782001E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 22:32:41] iteration 370/ 1000 | consumed samples: 23680 | elapsed time per iteration (ms): 93459.1 | throughput per GPU (TFLOP/s/GPU): 82.5 | learning rate: 3.658740E-06 | global batch size: 64 | lm loss: 6.857321E-01 | loss scale: 1.0 | grad norm: 0.755 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-27 22:33:58] iteration 371/ 1000 | consumed samples: 23744 | elapsed time per iteration (ms): 77477.9 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 3.651659E-06 | global batch size: 64 | lm loss: 6.332526E-01 | loss scale: 1.0 | grad norm: 0.824 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 22:36:11] iteration 372/ 1000 | consumed samples: 23808 | elapsed time per iteration (ms): 133201.3 | throughput per GPU (TFLOP/s/GPU): 57.9 | learning rate: 3.644565E-06 | global batch size: 64 | lm loss: 6.694164E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 22:37:24] iteration 373/ 1000 | consumed samples: 23872 | elapsed time per iteration (ms): 72420.9 | throughput per GPU (TFLOP/s/GPU): 106.4 | learning rate: 3.637460E-06 | global batch size: 64 | lm loss: 6.407461E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 22:38:37] iteration 374/ 1000 | consumed samples: 23936 | elapsed time per iteration (ms): 72701.7 | throughput per GPU (TFLOP/s/GPU): 106.0 | learning rate: 3.630344E-06 | global batch size: 64 | lm loss: 6.712391E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-27 22:40:02] iteration 375/ 1000 | consumed samples: 24000 | elapsed time per iteration (ms): 85487.1 | throughput per GPU (TFLOP/s/GPU): 90.2 | learning rate: 3.623217E-06 | global batch size: 64 | lm loss: 6.910558E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 22:41:12] iteration 376/ 1000 | consumed samples: 24064 | elapsed time per iteration (ms): 69634.9 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 3.616078E-06 | global batch size: 64 | lm loss: 8.254275E-01 | loss scale: 1.0 | grad norm: 1.206 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-27 22:42:21] iteration 377/ 1000 | consumed samples: 24128 | elapsed time per iteration (ms): 69132.3 | throughput per GPU (TFLOP/s/GPU): 111.5 | learning rate: 3.608928E-06 | global batch size: 64 | lm loss: 7.217027E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 22:44:28] iteration 378/ 1000 | consumed samples: 24192 | elapsed time per iteration (ms): 127310.0 | throughput per GPU (TFLOP/s/GPU): 60.5 | learning rate: 3.601767E-06 | global batch size: 64 | lm loss: 6.989653E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-27 22:45:37] iteration 379/ 1000 | consumed samples: 24256 | elapsed time per iteration (ms): 68915.3 | throughput per GPU (TFLOP/s/GPU): 111.9 | learning rate: 3.594595E-06 | global batch size: 64 | lm loss: 7.196112E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 22:47:03] iteration 380/ 1000 | consumed samples: 24320 | elapsed time per iteration (ms): 85554.5 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 3.587412E-06 | global batch size: 64 | lm loss: 7.016000E-01 | loss scale: 1.0 | grad norm: 0.764 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-27 22:48:49] iteration 381/ 1000 | consumed samples: 24384 | elapsed time per iteration (ms): 106037.3 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 3.580218E-06 | global batch size: 64 | lm loss: 7.071315E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-27 22:50:23] iteration 382/ 1000 | consumed samples: 24448 | elapsed time per iteration (ms): 93817.4 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 3.573013E-06 | global batch size: 64 | lm loss: 6.622055E-01 | loss scale: 1.0 | grad norm: 1.015 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 22:51:47] iteration 383/ 1000 | consumed samples: 24512 | elapsed time per iteration (ms): 84171.7 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 3.565798E-06 | global batch size: 64 | lm loss: 6.726893E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-27 22:53:28] iteration 384/ 1000 | consumed samples: 24576 | elapsed time per iteration (ms): 101481.2 | throughput per GPU (TFLOP/s/GPU): 76.0 | learning rate: 3.558572E-06 | global batch size: 64 | lm loss: 6.746169E-01 | loss scale: 1.0 | grad norm: 0.820 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-27 22:54:49] iteration 385/ 1000 | consumed samples: 24640 | elapsed time per iteration (ms): 80548.4 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 3.551335E-06 | global batch size: 64 | lm loss: 7.008195E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-27 22:56:18] iteration 386/ 1000 | consumed samples: 24704 | elapsed time per iteration (ms): 89194.3 | throughput per GPU (TFLOP/s/GPU): 86.4 | learning rate: 3.544088E-06 | global batch size: 64 | lm loss: 6.912686E-01 | loss scale: 1.0 | grad norm: 1.071 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure + [2024-11-27 22:57:44] iteration 387/ 1000 | consumed samples: 24768 | elapsed time per iteration (ms): 86242.0 | throughput per GPU (TFLOP/s/GPU): 89.4 | learning rate: 3.536830E-06 | global batch size: 64 | lm loss: 6.442069E-01 | loss scale: 1.0 | grad norm: 0.772 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-27 22:59:04] iteration 388/ 1000 | consumed samples: 24832 | elapsed time per iteration (ms): 80049.2 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 3.529562E-06 | global batch size: 64 | lm loss: 6.354716E-01 | loss scale: 1.0 | grad norm: 0.705 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 23:00:24] iteration 389/ 1000 | consumed samples: 24896 | elapsed time per iteration (ms): 80050.4 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 3.522284E-06 | global batch size: 64 | lm loss: 6.333457E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 23:01:53] iteration 390/ 1000 | consumed samples: 24960 | elapsed time per iteration (ms): 88041.1 | throughput per GPU (TFLOP/s/GPU): 87.6 | learning rate: 3.514996E-06 | global batch size: 64 | lm loss: 7.811137E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure + [2024-11-27 23:03:19] iteration 391/ 1000 | consumed samples: 25024 | elapsed time per iteration (ms): 86427.6 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.507697E-06 | global batch size: 64 | lm loss: 6.654625E-01 | loss scale: 1.0 | grad norm: 0.799 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-27 23:04:43] iteration 392/ 1000 | consumed samples: 25088 | elapsed time per iteration (ms): 84323.1 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 3.500388E-06 | global batch size: 64 | lm loss: 7.324268E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 23:06:08] iteration 393/ 1000 | consumed samples: 25152 | elapsed time per iteration (ms): 84520.0 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 3.493070E-06 | global batch size: 64 | lm loss: 7.075290E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-27 23:08:06] iteration 394/ 1000 | consumed samples: 25216 | elapsed time per iteration (ms): 118235.3 | throughput per GPU (TFLOP/s/GPU): 65.2 | learning rate: 3.485741E-06 | global batch size: 64 | lm loss: 7.145350E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-27 23:09:20] iteration 395/ 1000 | consumed samples: 25280 | elapsed time per iteration (ms): 73522.1 | throughput per GPU (TFLOP/s/GPU): 104.8 | learning rate: 3.478403E-06 | global batch size: 64 | lm loss: 6.982417E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-27 23:10:55] iteration 396/ 1000 | consumed samples: 25344 | elapsed time per iteration (ms): 95138.1 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 3.471055E-06 | global batch size: 64 | lm loss: 6.405466E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-27 23:12:19] iteration 397/ 1000 | consumed samples: 25408 | elapsed time per iteration (ms): 84195.5 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 3.463697E-06 | global batch size: 64 | lm loss: 6.580856E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-27 23:14:06] iteration 398/ 1000 | consumed samples: 25472 | elapsed time per iteration (ms): 107101.7 | throughput per GPU (TFLOP/s/GPU): 72.0 | learning rate: 3.456330E-06 | global batch size: 64 | lm loss: 7.027028E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-27 23:16:59] iteration 399/ 1000 | consumed samples: 25536 | elapsed time per iteration (ms): 173130.9 | throughput per GPU (TFLOP/s/GPU): 44.5 | learning rate: 3.448953E-06 | global batch size: 64 | lm loss: 7.018545E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 23:18:21] iteration 400/ 1000 | consumed samples: 25600 | elapsed time per iteration (ms): 81999.1 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 3.441567E-06 | global batch size: 64 | lm loss: 6.496854E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (229487.34, 229487.67) +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-27 23:23:39] iteration 401/ 1000 | consumed samples: 25664 | elapsed time per iteration (ms): 88544.4 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 3.434172E-06 | global batch size: 64 | lm loss: 8.111320E-01 | loss scale: 1.0 | grad norm: 1.285 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure + [2024-11-27 23:25:00] iteration 402/ 1000 | consumed samples: 25728 | elapsed time per iteration (ms): 80221.6 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 3.426767E-06 | global batch size: 64 | lm loss: 6.549807E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec6b6640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-27 23:26:11] iteration 403/ 1000 | consumed samples: 25792 | elapsed time per iteration (ms): 71144.5 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 3.419353E-06 | global batch size: 64 | lm loss: 6.545295E-01 | loss scale: 1.0 | grad norm: 0.752 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 23:27:33] iteration 404/ 1000 | consumed samples: 25856 | elapsed time per iteration (ms): 82690.1 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 3.411930E-06 | global batch size: 64 | lm loss: 6.961581E-01 | loss scale: 1.0 | grad norm: 0.867 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 23:29:07] iteration 405/ 1000 | consumed samples: 25920 | elapsed time per iteration (ms): 93627.9 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 3.404497E-06 | global batch size: 64 | lm loss: 6.395518E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-27 23:30:29] iteration 406/ 1000 | consumed samples: 25984 | elapsed time per iteration (ms): 81863.0 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 3.397056E-06 | global batch size: 64 | lm loss: 6.411680E-01 | loss scale: 1.0 | grad norm: 1.623 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 23:31:38] iteration 407/ 1000 | consumed samples: 26048 | elapsed time per iteration (ms): 69084.0 | throughput per GPU (TFLOP/s/GPU): 111.6 | learning rate: 3.389606E-06 | global batch size: 64 | lm loss: 6.546772E-01 | loss scale: 1.0 | grad norm: 0.769 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-27 23:33:10] iteration 408/ 1000 | consumed samples: 26112 | elapsed time per iteration (ms): 91986.0 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 3.382147E-06 | global batch size: 64 | lm loss: 6.975461E-01 | loss scale: 1.0 | grad norm: 1.242 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 23:34:37] iteration 409/ 1000 | consumed samples: 26176 | elapsed time per iteration (ms): 87219.4 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 3.374680E-06 | global batch size: 64 | lm loss: 6.866903E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +processed_samples 1500 unjoint_samples 1500 joint_samples 104 [83603, 101734] +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +processed_samples 1500 unjoint_samples 1500 joint_samples 104 [83603, 101734] +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +processed_samples 1500 unjoint_samples 1500 joint_samples 99 [99965, 39100] +processed_samples 1500 unjoint_samples 1500 joint_samples 99 [99965, 39100] +processed_samples 1500 unjoint_samples 1500 joint_samples 98 [115630, 100088] +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +processed_samples 1500 unjoint_samples 1500 joint_samples 98 [115630, 100088] +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +processed_samples 1500 unjoint_samples 1500 joint_samples 96 [46295, 93229] +processed_samples 1500 unjoint_samples 1500 joint_samples 96 [46295, 93229] + [2024-11-27 23:36:20] iteration 410/ 1000 | consumed samples: 26240 | elapsed time per iteration (ms): 102666.0 | throughput per GPU (TFLOP/s/GPU): 75.1 | learning rate: 3.367203E-06 | global batch size: 64 | lm loss: 6.706418E-01 | loss scale: 1.0 | grad norm: 1.213 | number of skipped iterations: 0 | number of nan iterations: 0 | +processed_samples 1500 unjoint_samples 1500 joint_samples 111 [110989, 83124] +processed_samples 1500 unjoint_samples 1500 joint_samples 111 [110989, 83124] +processed_samples 1500 unjoint_samples 1500 joint_samples 105 [128523, 100177] +processed_samples 1500 unjoint_samples 1500 joint_samples 105 [128523, 100177] +processed_samples 1500 unjoint_samples 1500 joint_samples 99 [113112, 123890] +processed_samples 1500 unjoint_samples 1500 joint_samples 99 [113112, 123890] +processed_samples 1500 unjoint_samples 1500 joint_samples 109 [107974, 121638] +processed_samples 1500 unjoint_samples 1500 joint_samples 109 [107974, 121638] +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 23:37:52] iteration 411/ 1000 | consumed samples: 26304 | elapsed time per iteration (ms): 91989.4 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 3.359719E-06 | global batch size: 64 | lm loss: 6.757123E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 23:41:09] iteration 412/ 1000 | consumed samples: 26368 | elapsed time per iteration (ms): 196820.6 | throughput per GPU (TFLOP/s/GPU): 39.2 | learning rate: 3.352225E-06 | global batch size: 64 | lm loss: 6.460667E-01 | loss scale: 1.0 | grad norm: 0.960 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-27 23:43:18] iteration 413/ 1000 | consumed samples: 26432 | elapsed time per iteration (ms): 129341.2 | throughput per GPU (TFLOP/s/GPU): 59.6 | learning rate: 3.344724E-06 | global batch size: 64 | lm loss: 6.627863E-01 | loss scale: 1.0 | grad norm: 0.784 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-27 23:44:34] iteration 414/ 1000 | consumed samples: 26496 | elapsed time per iteration (ms): 75545.7 | throughput per GPU (TFLOP/s/GPU): 102.0 | learning rate: 3.337214E-06 | global batch size: 64 | lm loss: 6.712230E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-27 23:46:07] iteration 415/ 1000 | consumed samples: 26560 | elapsed time per iteration (ms): 93195.6 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 3.329695E-06 | global batch size: 64 | lm loss: 7.355238E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9570eaac0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure + [2024-11-27 23:47:22] iteration 416/ 1000 | consumed samples: 26624 | elapsed time per iteration (ms): 75505.2 | throughput per GPU (TFLOP/s/GPU): 102.1 | learning rate: 3.322169E-06 | global batch size: 64 | lm loss: 6.778505E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 23:48:37] iteration 417/ 1000 | consumed samples: 26688 | elapsed time per iteration (ms): 74429.2 | throughput per GPU (TFLOP/s/GPU): 103.6 | learning rate: 3.314634E-06 | global batch size: 64 | lm loss: 6.249003E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure + [2024-11-27 23:49:49] iteration 418/ 1000 | consumed samples: 26752 | elapsed time per iteration (ms): 72617.8 | throughput per GPU (TFLOP/s/GPU): 106.2 | learning rate: 3.307092E-06 | global batch size: 64 | lm loss: 7.011807E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure + [2024-11-27 23:51:04] iteration 419/ 1000 | consumed samples: 26816 | elapsed time per iteration (ms): 74084.5 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 3.299541E-06 | global batch size: 64 | lm loss: 6.895205E-01 | loss scale: 1.0 | grad norm: 0.944 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure + [2024-11-27 23:52:18] iteration 420/ 1000 | consumed samples: 26880 | elapsed time per iteration (ms): 74639.6 | throughput per GPU (TFLOP/s/GPU): 103.3 | learning rate: 3.291983E-06 | global batch size: 64 | lm loss: 6.247544E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-27 23:53:40] iteration 421/ 1000 | consumed samples: 26944 | elapsed time per iteration (ms): 81462.5 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 3.284416E-06 | global batch size: 64 | lm loss: 6.611052E-01 | loss scale: 1.0 | grad norm: 0.900 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure + [2024-11-27 23:54:56] iteration 422/ 1000 | consumed samples: 27008 | elapsed time per iteration (ms): 76765.4 | throughput per GPU (TFLOP/s/GPU): 100.4 | learning rate: 3.276843E-06 | global batch size: 64 | lm loss: 7.140992E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-27 23:56:17] iteration 423/ 1000 | consumed samples: 27072 | elapsed time per iteration (ms): 80528.3 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 3.269261E-06 | global batch size: 64 | lm loss: 6.346519E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-27 23:57:33] iteration 424/ 1000 | consumed samples: 27136 | elapsed time per iteration (ms): 76346.0 | throughput per GPU (TFLOP/s/GPU): 101.0 | learning rate: 3.261672E-06 | global batch size: 64 | lm loss: 6.726185E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-27 23:59:47] iteration 425/ 1000 | consumed samples: 27200 | elapsed time per iteration (ms): 133679.8 | throughput per GPU (TFLOP/s/GPU): 57.7 | learning rate: 3.254075E-06 | global batch size: 64 | lm loss: 7.108432E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 00:01:09] iteration 426/ 1000 | consumed samples: 27264 | elapsed time per iteration (ms): 82143.2 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 3.246472E-06 | global batch size: 64 | lm loss: 6.866354E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure + [2024-11-28 00:02:27] iteration 427/ 1000 | consumed samples: 27328 | elapsed time per iteration (ms): 77428.6 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 3.238860E-06 | global batch size: 64 | lm loss: 6.865810E-01 | loss scale: 1.0 | grad norm: 0.855 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 00:03:41] iteration 428/ 1000 | consumed samples: 27392 | elapsed time per iteration (ms): 74443.4 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 3.231242E-06 | global batch size: 64 | lm loss: 6.732894E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 00:05:10] iteration 429/ 1000 | consumed samples: 27456 | elapsed time per iteration (ms): 88691.8 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 3.223616E-06 | global batch size: 64 | lm loss: 5.977175E-01 | loss scale: 1.0 | grad norm: 0.753 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 00:06:37] iteration 430/ 1000 | consumed samples: 27520 | elapsed time per iteration (ms): 87480.3 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 3.215984E-06 | global batch size: 64 | lm loss: 6.304359E-01 | loss scale: 1.0 | grad norm: 1.090 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 00:07:58] iteration 431/ 1000 | consumed samples: 27584 | elapsed time per iteration (ms): 80605.4 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 3.208344E-06 | global batch size: 64 | lm loss: 6.228154E-01 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95eeb7b40] mmco: unref short failure +[h264 @ 0x55d95eeb7b40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95eeb7b40] mmco: unref short failure +[h264 @ 0x55d95eeb7b40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95eeb7b40] mmco: unref short failure +[h264 @ 0x55d95eeb7b40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95eeb7b40] mmco: unref short failure + [2024-11-28 00:09:08] iteration 432/ 1000 | consumed samples: 27648 | elapsed time per iteration (ms): 69948.3 | throughput per GPU (TFLOP/s/GPU): 110.2 | learning rate: 3.200697E-06 | global batch size: 64 | lm loss: 6.354905E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee8137c0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee8137c0] mmco: unref short failure +[h264 @ 0x555dee8137c0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure + [2024-11-28 00:10:45] iteration 433/ 1000 | consumed samples: 27712 | elapsed time per iteration (ms): 96983.5 | throughput per GPU (TFLOP/s/GPU): 79.5 | learning rate: 3.193044E-06 | global batch size: 64 | lm loss: 6.355907E-01 | loss scale: 1.0 | grad norm: 0.977 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure + [2024-11-28 00:12:07] iteration 434/ 1000 | consumed samples: 27776 | elapsed time per iteration (ms): 82614.7 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 3.185384E-06 | global batch size: 64 | lm loss: 6.905267E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 00:13:50] iteration 435/ 1000 | consumed samples: 27840 | elapsed time per iteration (ms): 102409.5 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 3.177717E-06 | global batch size: 64 | lm loss: 6.286079E-01 | loss scale: 1.0 | grad norm: 0.714 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure + [2024-11-28 00:15:18] iteration 436/ 1000 | consumed samples: 27904 | elapsed time per iteration (ms): 88263.1 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 3.170044E-06 | global batch size: 64 | lm loss: 6.589005E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-28 00:16:55] iteration 437/ 1000 | consumed samples: 27968 | elapsed time per iteration (ms): 96811.4 | throughput per GPU (TFLOP/s/GPU): 79.6 | learning rate: 3.162364E-06 | global batch size: 64 | lm loss: 6.647096E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555df0bdeb40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 00:18:31] iteration 438/ 1000 | consumed samples: 28032 | elapsed time per iteration (ms): 95986.1 | throughput per GPU (TFLOP/s/GPU): 80.3 | learning rate: 3.154678E-06 | global batch size: 64 | lm loss: 6.228602E-01 | loss scale: 1.0 | grad norm: 0.879 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 00:20:04] iteration 439/ 1000 | consumed samples: 28096 | elapsed time per iteration (ms): 93509.2 | throughput per GPU (TFLOP/s/GPU): 82.4 | learning rate: 3.146985E-06 | global batch size: 64 | lm loss: 6.783546E-01 | loss scale: 1.0 | grad norm: 0.960 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure + [2024-11-28 00:22:07] iteration 440/ 1000 | consumed samples: 28160 | elapsed time per iteration (ms): 122703.1 | throughput per GPU (TFLOP/s/GPU): 62.8 | learning rate: 3.139286E-06 | global batch size: 64 | lm loss: 6.260735E-01 | loss scale: 1.0 | grad norm: 0.852 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 00:23:26] iteration 441/ 1000 | consumed samples: 28224 | elapsed time per iteration (ms): 78888.0 | throughput per GPU (TFLOP/s/GPU): 97.7 | learning rate: 3.131581E-06 | global batch size: 64 | lm loss: 7.180869E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure + [2024-11-28 00:24:32] iteration 442/ 1000 | consumed samples: 28288 | elapsed time per iteration (ms): 65837.4 | throughput per GPU (TFLOP/s/GPU): 117.1 | learning rate: 3.123870E-06 | global batch size: 64 | lm loss: 6.873631E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 00:26:03] iteration 443/ 1000 | consumed samples: 28352 | elapsed time per iteration (ms): 91043.0 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 3.116153E-06 | global batch size: 64 | lm loss: 6.662303E-01 | loss scale: 1.0 | grad norm: 0.792 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure + [2024-11-28 00:27:20] iteration 444/ 1000 | consumed samples: 28416 | elapsed time per iteration (ms): 76727.9 | throughput per GPU (TFLOP/s/GPU): 100.5 | learning rate: 3.108430E-06 | global batch size: 64 | lm loss: 6.865659E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 00:28:42] iteration 445/ 1000 | consumed samples: 28480 | elapsed time per iteration (ms): 82107.0 | throughput per GPU (TFLOP/s/GPU): 93.9 | learning rate: 3.100701E-06 | global batch size: 64 | lm loss: 6.737128E-01 | loss scale: 1.0 | grad norm: 1.008 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 00:30:05] iteration 446/ 1000 | consumed samples: 28544 | elapsed time per iteration (ms): 83389.9 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 3.092966E-06 | global batch size: 64 | lm loss: 6.630214E-01 | loss scale: 1.0 | grad norm: 0.971 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 00:31:40] iteration 447/ 1000 | consumed samples: 28608 | elapsed time per iteration (ms): 94623.0 | throughput per GPU (TFLOP/s/GPU): 81.5 | learning rate: 3.085225E-06 | global batch size: 64 | lm loss: 6.231685E-01 | loss scale: 1.0 | grad norm: 1.500 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 00:33:05] iteration 448/ 1000 | consumed samples: 28672 | elapsed time per iteration (ms): 85533.0 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 3.077479E-06 | global batch size: 64 | lm loss: 6.156682E-01 | loss scale: 1.0 | grad norm: 0.758 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 00:34:31] iteration 449/ 1000 | consumed samples: 28736 | elapsed time per iteration (ms): 85558.9 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 3.069728E-06 | global batch size: 64 | lm loss: 6.721120E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 00:35:53] iteration 450/ 1000 | consumed samples: 28800 | elapsed time per iteration (ms): 82352.0 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 3.061971E-06 | global batch size: 64 | lm loss: 6.951571E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure + [2024-11-28 00:37:18] iteration 451/ 1000 | consumed samples: 28864 | elapsed time per iteration (ms): 84925.7 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 3.054208E-06 | global batch size: 64 | lm loss: 6.865417E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 00:38:41] iteration 452/ 1000 | consumed samples: 28928 | elapsed time per iteration (ms): 82996.1 | throughput per GPU (TFLOP/s/GPU): 92.9 | learning rate: 3.046440E-06 | global batch size: 64 | lm loss: 6.977232E-01 | loss scale: 1.0 | grad norm: 0.946 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure + [2024-11-28 00:39:56] iteration 453/ 1000 | consumed samples: 28992 | elapsed time per iteration (ms): 74453.3 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 3.038667E-06 | global batch size: 64 | lm loss: 5.958530E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 00:41:11] iteration 454/ 1000 | consumed samples: 29056 | elapsed time per iteration (ms): 75397.2 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 3.030889E-06 | global batch size: 64 | lm loss: 6.823794E-01 | loss scale: 1.0 | grad norm: 0.788 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-28 00:43:17] iteration 455/ 1000 | consumed samples: 29120 | elapsed time per iteration (ms): 125556.0 | throughput per GPU (TFLOP/s/GPU): 61.4 | learning rate: 3.023106E-06 | global batch size: 64 | lm loss: 6.797455E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 00:44:48] iteration 456/ 1000 | consumed samples: 29184 | elapsed time per iteration (ms): 90955.3 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 3.015318E-06 | global batch size: 64 | lm loss: 6.156820E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 00:47:40] iteration 457/ 1000 | consumed samples: 29248 | elapsed time per iteration (ms): 171908.3 | throughput per GPU (TFLOP/s/GPU): 44.8 | learning rate: 3.007525E-06 | global batch size: 64 | lm loss: 7.012556E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 00:48:58] iteration 458/ 1000 | consumed samples: 29312 | elapsed time per iteration (ms): 78811.4 | throughput per GPU (TFLOP/s/GPU): 97.8 | learning rate: 2.999727E-06 | global batch size: 64 | lm loss: 5.902069E-01 | loss scale: 1.0 | grad norm: 0.790 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-28 00:50:28] iteration 459/ 1000 | consumed samples: 29376 | elapsed time per iteration (ms): 89652.5 | throughput per GPU (TFLOP/s/GPU): 86.0 | learning rate: 2.991925E-06 | global batch size: 64 | lm loss: 6.764930E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-28 00:51:42] iteration 460/ 1000 | consumed samples: 29440 | elapsed time per iteration (ms): 73936.2 | throughput per GPU (TFLOP/s/GPU): 104.3 | learning rate: 2.984118E-06 | global batch size: 64 | lm loss: 6.732601E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-28 00:53:14] iteration 461/ 1000 | consumed samples: 29504 | elapsed time per iteration (ms): 92328.7 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 2.976306E-06 | global batch size: 64 | lm loss: 7.627915E-01 | loss scale: 1.0 | grad norm: 1.039 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure + [2024-11-28 00:55:17] iteration 462/ 1000 | consumed samples: 29568 | elapsed time per iteration (ms): 122529.1 | throughput per GPU (TFLOP/s/GPU): 62.9 | learning rate: 2.968490E-06 | global batch size: 64 | lm loss: 6.704935E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 00:57:22] iteration 463/ 1000 | consumed samples: 29632 | elapsed time per iteration (ms): 124583.0 | throughput per GPU (TFLOP/s/GPU): 61.9 | learning rate: 2.960670E-06 | global batch size: 64 | lm loss: 7.774192E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x555dedf1d500] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-28 00:58:40] iteration 464/ 1000 | consumed samples: 29696 | elapsed time per iteration (ms): 78714.4 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 2.952845E-06 | global batch size: 64 | lm loss: 6.651924E-01 | loss scale: 1.0 | grad norm: 0.995 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure + [2024-11-28 00:59:59] iteration 465/ 1000 | consumed samples: 29760 | elapsed time per iteration (ms): 78413.5 | throughput per GPU (TFLOP/s/GPU): 98.3 | learning rate: 2.945016E-06 | global batch size: 64 | lm loss: 6.635730E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 01:01:23] iteration 466/ 1000 | consumed samples: 29824 | elapsed time per iteration (ms): 83878.6 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 2.937183E-06 | global batch size: 64 | lm loss: 6.822602E-01 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 01:03:07] iteration 467/ 1000 | consumed samples: 29888 | elapsed time per iteration (ms): 104690.7 | throughput per GPU (TFLOP/s/GPU): 73.6 | learning rate: 2.929345E-06 | global batch size: 64 | lm loss: 6.986808E-01 | loss scale: 1.0 | grad norm: 1.090 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure + [2024-11-28 01:04:23] iteration 468/ 1000 | consumed samples: 29952 | elapsed time per iteration (ms): 75443.1 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 2.921504E-06 | global batch size: 64 | lm loss: 6.062692E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure + [2024-11-28 01:05:45] iteration 469/ 1000 | consumed samples: 30016 | elapsed time per iteration (ms): 82693.9 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 2.913659E-06 | global batch size: 64 | lm loss: 6.660897E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure + [2024-11-28 01:08:12] iteration 470/ 1000 | consumed samples: 30080 | elapsed time per iteration (ms): 147025.3 | throughput per GPU (TFLOP/s/GPU): 52.4 | learning rate: 2.905810E-06 | global batch size: 64 | lm loss: 6.328305E-01 | loss scale: 1.0 | grad norm: 0.839 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure + [2024-11-28 01:09:25] iteration 471/ 1000 | consumed samples: 30144 | elapsed time per iteration (ms): 72860.1 | throughput per GPU (TFLOP/s/GPU): 105.8 | learning rate: 2.897957E-06 | global batch size: 64 | lm loss: 6.858408E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555deed21240] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure + [2024-11-28 01:11:13] iteration 472/ 1000 | consumed samples: 30208 | elapsed time per iteration (ms): 107941.5 | throughput per GPU (TFLOP/s/GPU): 71.4 | learning rate: 2.890101E-06 | global batch size: 64 | lm loss: 7.221738E-01 | loss scale: 1.0 | grad norm: 1.071 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure + [2024-11-28 01:12:40] iteration 473/ 1000 | consumed samples: 30272 | elapsed time per iteration (ms): 86691.0 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 2.882241E-06 | global batch size: 64 | lm loss: 6.783547E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure + [2024-11-28 01:14:19] iteration 474/ 1000 | consumed samples: 30336 | elapsed time per iteration (ms): 98989.4 | throughput per GPU (TFLOP/s/GPU): 77.9 | learning rate: 2.874378E-06 | global batch size: 64 | lm loss: 6.476663E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 01:15:46] iteration 475/ 1000 | consumed samples: 30400 | elapsed time per iteration (ms): 86919.5 | throughput per GPU (TFLOP/s/GPU): 88.7 | learning rate: 2.866511E-06 | global batch size: 64 | lm loss: 7.588748E-01 | loss scale: 1.0 | grad norm: 1.063 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure + [2024-11-28 01:17:37] iteration 476/ 1000 | consumed samples: 30464 | elapsed time per iteration (ms): 110927.6 | throughput per GPU (TFLOP/s/GPU): 69.5 | learning rate: 2.858641E-06 | global batch size: 64 | lm loss: 6.688645E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure + [2024-11-28 01:19:10] iteration 477/ 1000 | consumed samples: 30528 | elapsed time per iteration (ms): 92831.5 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 2.850767E-06 | global batch size: 64 | lm loss: 6.710600E-01 | loss scale: 1.0 | grad norm: 0.973 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 01:20:48] iteration 478/ 1000 | consumed samples: 30592 | elapsed time per iteration (ms): 98092.4 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 2.842891E-06 | global batch size: 64 | lm loss: 5.867029E-01 | loss scale: 1.0 | grad norm: 1.056 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] [h264 @ 0x555ded679600] mmco: unref short failure +mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 01:22:14] iteration 479/ 1000 | consumed samples: 30656 | elapsed time per iteration (ms): 86072.4 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 2.835011E-06 | global batch size: 64 | lm loss: 6.228137E-01 | loss scale: 1.0 | grad norm: 0.979 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure + [2024-11-28 01:23:43] iteration 480/ 1000 | consumed samples: 30720 | elapsed time per iteration (ms): 88842.9 | throughput per GPU (TFLOP/s/GPU): 86.8 | learning rate: 2.827129E-06 | global batch size: 64 | lm loss: 6.328573E-01 | loss scale: 1.0 | grad norm: 0.946 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure + [2024-11-28 01:26:05] iteration 481/ 1000 | consumed samples: 30784 | elapsed time per iteration (ms): 141791.4 | throughput per GPU (TFLOP/s/GPU): 54.4 | learning rate: 2.819243E-06 | global batch size: 64 | lm loss: 6.670113E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure + [2024-11-28 01:27:31] iteration 482/ 1000 | consumed samples: 30848 | elapsed time per iteration (ms): 86684.9 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 2.811355E-06 | global batch size: 64 | lm loss: 7.225530E-01 | loss scale: 1.0 | grad norm: 1.018 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure + [2024-11-28 01:29:31] iteration 483/ 1000 | consumed samples: 30912 | elapsed time per iteration (ms): 119464.5 | throughput per GPU (TFLOP/s/GPU): 64.5 | learning rate: 2.803464E-06 | global batch size: 64 | lm loss: 6.893146E-01 | loss scale: 1.0 | grad norm: 0.940 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 01:30:36] iteration 484/ 1000 | consumed samples: 30976 | elapsed time per iteration (ms): 65499.5 | throughput per GPU (TFLOP/s/GPU): 117.7 | learning rate: 2.795570E-06 | global batch size: 64 | lm loss: 6.424096E-01 | loss scale: 1.0 | grad norm: 0.867 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 01:31:52] iteration 485/ 1000 | consumed samples: 31040 | elapsed time per iteration (ms): 75993.8 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 2.787674E-06 | global batch size: 64 | lm loss: 6.655073E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure + [2024-11-28 01:33:34] iteration 486/ 1000 | consumed samples: 31104 | elapsed time per iteration (ms): 101309.3 | throughput per GPU (TFLOP/s/GPU): 76.1 | learning rate: 2.779775E-06 | global batch size: 64 | lm loss: 6.528612E-01 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure + [2024-11-28 01:34:47] iteration 487/ 1000 | consumed samples: 31168 | elapsed time per iteration (ms): 73484.9 | throughput per GPU (TFLOP/s/GPU): 104.9 | learning rate: 2.771874E-06 | global batch size: 64 | lm loss: 6.569068E-01 | loss scale: 1.0 | grad norm: 0.904 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure + [2024-11-28 01:36:11] iteration 488/ 1000 | consumed samples: 31232 | elapsed time per iteration (ms): 84370.3 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 2.763971E-06 | global batch size: 64 | lm loss: 6.942212E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 01:37:28] iteration 489/ 1000 | consumed samples: 31296 | elapsed time per iteration (ms): 76239.9 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 2.756065E-06 | global batch size: 64 | lm loss: 6.821255E-01 | loss scale: 1.0 | grad norm: 0.919 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 01:38:35] iteration 490/ 1000 | consumed samples: 31360 | elapsed time per iteration (ms): 67252.0 | throughput per GPU (TFLOP/s/GPU): 114.6 | learning rate: 2.748157E-06 | global batch size: 64 | lm loss: 6.273899E-01 | loss scale: 1.0 | grad norm: 0.730 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure + [2024-11-28 01:40:13] iteration 491/ 1000 | consumed samples: 31424 | elapsed time per iteration (ms): 98216.4 | throughput per GPU (TFLOP/s/GPU): 78.5 | learning rate: 2.740247E-06 | global batch size: 64 | lm loss: 6.733717E-01 | loss scale: 1.0 | grad norm: 0.855 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 01:41:59] iteration 492/ 1000 | consumed samples: 31488 | elapsed time per iteration (ms): 106125.0 | throughput per GPU (TFLOP/s/GPU): 72.6 | learning rate: 2.732335E-06 | global batch size: 64 | lm loss: 6.473522E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure + [2024-11-28 01:43:46] iteration 493/ 1000 | consumed samples: 31552 | elapsed time per iteration (ms): 106629.2 | throughput per GPU (TFLOP/s/GPU): 72.3 | learning rate: 2.724421E-06 | global batch size: 64 | lm loss: 6.177192E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure + [2024-11-28 01:45:03] iteration 494/ 1000 | consumed samples: 31616 | elapsed time per iteration (ms): 76563.4 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 2.716506E-06 | global batch size: 64 | lm loss: 7.289105E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-28 01:46:25] iteration 495/ 1000 | consumed samples: 31680 | elapsed time per iteration (ms): 82717.2 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 2.708588E-06 | global batch size: 64 | lm loss: 6.200334E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 01:47:53] iteration 496/ 1000 | consumed samples: 31744 | elapsed time per iteration (ms): 87485.8 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 2.700669E-06 | global batch size: 64 | lm loss: 6.855780E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure + [2024-11-28 01:49:06] iteration 497/ 1000 | consumed samples: 31808 | elapsed time per iteration (ms): 72862.1 | throughput per GPU (TFLOP/s/GPU): 105.8 | learning rate: 2.692748E-06 | global batch size: 64 | lm loss: 6.573794E-01 | loss scale: 1.0 | grad norm: 0.970 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 01:50:14] iteration 498/ 1000 | consumed samples: 31872 | elapsed time per iteration (ms): 68859.5 | throughput per GPU (TFLOP/s/GPU): 111.9 | learning rate: 2.684826E-06 | global batch size: 64 | lm loss: 7.352840E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure + [2024-11-28 01:51:31] iteration 499/ 1000 | consumed samples: 31936 | elapsed time per iteration (ms): 76698.5 | throughput per GPU (TFLOP/s/GPU): 100.5 | learning rate: 2.676902E-06 | global batch size: 64 | lm loss: 6.518524E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure + [2024-11-28 01:52:58] iteration 500/ 1000 | consumed samples: 32000 | elapsed time per iteration (ms): 86461.1 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 2.668977E-06 | global batch size: 64 | lm loss: 7.033284E-01 | loss scale: 1.0 | grad norm: 0.996 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (253152.30, 253152.62) +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 01:58:33] iteration 501/ 1000 | consumed samples: 32064 | elapsed time per iteration (ms): 81971.3 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 2.661051E-06 | global batch size: 64 | lm loss: 6.540481E-01 | loss scale: 1.0 | grad norm: 1.066 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-28 01:59:52] iteration 502/ 1000 | consumed samples: 32128 | elapsed time per iteration (ms): 79089.7 | throughput per GPU (TFLOP/s/GPU): 97.5 | learning rate: 2.653124E-06 | global batch size: 64 | lm loss: 6.702466E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure + [2024-11-28 02:01:19] iteration 503/ 1000 | consumed samples: 32192 | elapsed time per iteration (ms): 87171.6 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 2.645195E-06 | global batch size: 64 | lm loss: 6.394791E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555deea7ef00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 02:02:54] iteration 504/ 1000 | consumed samples: 32256 | elapsed time per iteration (ms): 95308.5 | throughput per GPU (TFLOP/s/GPU): 80.9 | learning rate: 2.637266E-06 | global batch size: 64 | lm loss: 7.753518E-01 | loss scale: 1.0 | grad norm: 1.224 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-28 02:04:44] iteration 505/ 1000 | consumed samples: 32320 | elapsed time per iteration (ms): 109858.1 | throughput per GPU (TFLOP/s/GPU): 70.2 | learning rate: 2.629336E-06 | global batch size: 64 | lm loss: 6.935418E-01 | loss scale: 1.0 | grad norm: 0.760 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 02:07:14] iteration 506/ 1000 | consumed samples: 32384 | elapsed time per iteration (ms): 149922.7 | throughput per GPU (TFLOP/s/GPU): 51.4 | learning rate: 2.621404E-06 | global batch size: 64 | lm loss: 6.945059E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:08:31] iteration 507/ 1000 | consumed samples: 32448 | elapsed time per iteration (ms): 76903.3 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.613473E-06 | global batch size: 64 | lm loss: 6.709553E-01 | loss scale: 1.0 | grad norm: 0.908 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-28 02:11:03] iteration 508/ 1000 | consumed samples: 32512 | elapsed time per iteration (ms): 152007.0 | throughput per GPU (TFLOP/s/GPU): 50.7 | learning rate: 2.605540E-06 | global batch size: 64 | lm loss: 6.527804E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956386f00] Missing reference picture, default is 65530 +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dedf05880] Missing reference picture, default is 65530 +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure + [2024-11-28 02:12:14] iteration 509/ 1000 | consumed samples: 32576 | elapsed time per iteration (ms): 71228.4 | throughput per GPU (TFLOP/s/GPU): 108.2 | learning rate: 2.597607E-06 | global batch size: 64 | lm loss: 6.511109E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:13:38] iteration 510/ 1000 | consumed samples: 32640 | elapsed time per iteration (ms): 83181.4 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 2.589673E-06 | global batch size: 64 | lm loss: 7.232425E-01 | loss scale: 1.0 | grad norm: 1.021 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure + [2024-11-28 02:15:14] iteration 511/ 1000 | consumed samples: 32704 | elapsed time per iteration (ms): 96538.9 | throughput per GPU (TFLOP/s/GPU): 79.8 | learning rate: 2.581739E-06 | global batch size: 64 | lm loss: 6.949818E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-28 02:16:38] iteration 512/ 1000 | consumed samples: 32768 | elapsed time per iteration (ms): 84289.3 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 2.573804E-06 | global batch size: 64 | lm loss: 6.826892E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure + [2024-11-28 02:17:53] iteration 513/ 1000 | consumed samples: 32832 | elapsed time per iteration (ms): 74479.1 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 2.565870E-06 | global batch size: 64 | lm loss: 6.208887E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-28 02:19:45] iteration 514/ 1000 | consumed samples: 32896 | elapsed time per iteration (ms): 112413.5 | throughput per GPU (TFLOP/s/GPU): 68.6 | learning rate: 2.557935E-06 | global batch size: 64 | lm loss: 6.680191E-01 | loss scale: 1.0 | grad norm: 1.049 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 02:21:03] iteration 515/ 1000 | consumed samples: 32960 | elapsed time per iteration (ms): 77731.3 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.550000E-06 | global batch size: 64 | lm loss: 6.346695E-01 | loss scale: 1.0 | grad norm: 1.071 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:22:56] iteration 516/ 1000 | consumed samples: 33024 | elapsed time per iteration (ms): 112534.7 | throughput per GPU (TFLOP/s/GPU): 68.5 | learning rate: 2.542065E-06 | global batch size: 64 | lm loss: 6.696709E-01 | loss scale: 1.0 | grad norm: 1.112 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-28 02:24:38] iteration 517/ 1000 | consumed samples: 33088 | elapsed time per iteration (ms): 102222.6 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 2.534130E-06 | global batch size: 64 | lm loss: 6.623139E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:26:02] iteration 518/ 1000 | consumed samples: 33152 | elapsed time per iteration (ms): 83961.8 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 2.526196E-06 | global batch size: 64 | lm loss: 6.795787E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 02:27:54] iteration 519/ 1000 | consumed samples: 33216 | elapsed time per iteration (ms): 111781.7 | throughput per GPU (TFLOP/s/GPU): 69.0 | learning rate: 2.518261E-06 | global batch size: 64 | lm loss: 6.612890E-01 | loss scale: 1.0 | grad norm: 0.910 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure + [2024-11-28 02:29:16] iteration 520/ 1000 | consumed samples: 33280 | elapsed time per iteration (ms): 82821.7 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 2.510327E-06 | global batch size: 64 | lm loss: 6.640331E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-28 02:30:56] iteration 521/ 1000 | consumed samples: 33344 | elapsed time per iteration (ms): 99758.6 | throughput per GPU (TFLOP/s/GPU): 77.3 | learning rate: 2.502393E-06 | global batch size: 64 | lm loss: 7.587718E-01 | loss scale: 1.0 | grad norm: 0.721 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957eff780] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957eff780] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957eff780] mmco: unref short failure + [2024-11-28 02:32:14] iteration 522/ 1000 | consumed samples: 33408 | elapsed time per iteration (ms): 77707.6 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.494460E-06 | global batch size: 64 | lm loss: 6.407965E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-28 02:33:56] iteration 523/ 1000 | consumed samples: 33472 | elapsed time per iteration (ms): 102303.9 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 2.486527E-06 | global batch size: 64 | lm loss: 7.260080E-01 | loss scale: 1.0 | grad norm: 1.015 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:35:12] iteration 524/ 1000 | consumed samples: 33536 | elapsed time per iteration (ms): 76197.9 | throughput per GPU (TFLOP/s/GPU): 101.2 | learning rate: 2.478596E-06 | global batch size: 64 | lm loss: 6.908966E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure + [2024-11-28 02:36:29] iteration 525/ 1000 | consumed samples: 33600 | elapsed time per iteration (ms): 76456.3 | throughput per GPU (TFLOP/s/GPU): 100.8 | learning rate: 2.470664E-06 | global batch size: 64 | lm loss: 6.749615E-01 | loss scale: 1.0 | grad norm: 0.942 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure + [2024-11-28 02:38:03] iteration 526/ 1000 | consumed samples: 33664 | elapsed time per iteration (ms): 93927.9 | throughput per GPU (TFLOP/s/GPU): 82.1 | learning rate: 2.462734E-06 | global batch size: 64 | lm loss: 6.362700E-01 | loss scale: 1.0 | grad norm: 0.938 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:39:42] iteration 527/ 1000 | consumed samples: 33728 | elapsed time per iteration (ms): 99402.2 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 2.454805E-06 | global batch size: 64 | lm loss: 6.780536E-01 | loss scale: 1.0 | grad norm: 9.398 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:41:00] iteration 528/ 1000 | consumed samples: 33792 | elapsed time per iteration (ms): 77721.5 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.446876E-06 | global batch size: 64 | lm loss: 8.120739E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 02:42:50] iteration 529/ 1000 | consumed samples: 33856 | elapsed time per iteration (ms): 110064.9 | throughput per GPU (TFLOP/s/GPU): 70.0 | learning rate: 2.438949E-06 | global batch size: 64 | lm loss: 6.335684E-01 | loss scale: 1.0 | grad norm: 0.962 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 02:44:03] iteration 530/ 1000 | consumed samples: 33920 | elapsed time per iteration (ms): 72503.1 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 2.431023E-06 | global batch size: 64 | lm loss: 6.750400E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 02:45:33] iteration 531/ 1000 | consumed samples: 33984 | elapsed time per iteration (ms): 90134.3 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 2.423098E-06 | global batch size: 64 | lm loss: 6.134099E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure + [2024-11-28 02:47:06] iteration 532/ 1000 | consumed samples: 34048 | elapsed time per iteration (ms): 92802.6 | throughput per GPU (TFLOP/s/GPU): 83.1 | learning rate: 2.415174E-06 | global batch size: 64 | lm loss: 6.257591E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:48:30] iteration 533/ 1000 | consumed samples: 34112 | elapsed time per iteration (ms): 84370.2 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 2.407252E-06 | global batch size: 64 | lm loss: 6.479869E-01 | loss scale: 1.0 | grad norm: 0.742 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-28 02:49:39] iteration 534/ 1000 | consumed samples: 34176 | elapsed time per iteration (ms): 68564.5 | throughput per GPU (TFLOP/s/GPU): 112.4 | learning rate: 2.399331E-06 | global batch size: 64 | lm loss: 6.712840E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 02:51:14] iteration 535/ 1000 | consumed samples: 34240 | elapsed time per iteration (ms): 95872.4 | throughput per GPU (TFLOP/s/GPU): 80.4 | learning rate: 2.391412E-06 | global batch size: 64 | lm loss: 6.123539E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 02:52:25] iteration 536/ 1000 | consumed samples: 34304 | elapsed time per iteration (ms): 70396.4 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 2.383494E-06 | global batch size: 64 | lm loss: 6.391864E-01 | loss scale: 1.0 | grad norm: 0.809 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def77b080] mmco: unref short failure +[h264 @ 0x555def77b080] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def77b080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure + [2024-11-28 02:54:03] iteration 537/ 1000 | consumed samples: 34368 | elapsed time per iteration (ms): 97681.3 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 2.375579E-06 | global batch size: 64 | lm loss: 6.583289E-01 | loss scale: 1.0 | grad norm: 0.947 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-28 02:55:47] iteration 538/ 1000 | consumed samples: 34432 | elapsed time per iteration (ms): 104400.8 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 2.367665E-06 | global batch size: 64 | lm loss: 7.243518E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 02:57:07] iteration 539/ 1000 | consumed samples: 34496 | elapsed time per iteration (ms): 80003.5 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 2.359753E-06 | global batch size: 64 | lm loss: 6.497184E-01 | loss scale: 1.0 | grad norm: 1.207 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 02:59:06] iteration 540/ 1000 | consumed samples: 34560 | elapsed time per iteration (ms): 119114.9 | throughput per GPU (TFLOP/s/GPU): 64.7 | learning rate: 2.351843E-06 | global batch size: 64 | lm loss: 6.795231E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 03:00:26] iteration 541/ 1000 | consumed samples: 34624 | elapsed time per iteration (ms): 80182.6 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 2.343935E-06 | global batch size: 64 | lm loss: 7.331969E-01 | loss scale: 1.0 | grad norm: 0.863 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 03:01:58] iteration 542/ 1000 | consumed samples: 34688 | elapsed time per iteration (ms): 91273.9 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 2.336029E-06 | global batch size: 64 | lm loss: 7.497106E-01 | loss scale: 1.0 | grad norm: 1.812 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 03:03:11] iteration 543/ 1000 | consumed samples: 34752 | elapsed time per iteration (ms): 73655.6 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 2.328126E-06 | global batch size: 64 | lm loss: 6.712408E-01 | loss scale: 1.0 | grad norm: 35.621 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 03:05:10] iteration 544/ 1000 | consumed samples: 34816 | elapsed time per iteration (ms): 118369.6 | throughput per GPU (TFLOP/s/GPU): 65.1 | learning rate: 2.320225E-06 | global batch size: 64 | lm loss: 6.137346E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 03:08:43] iteration 545/ 1000 | consumed samples: 34880 | elapsed time per iteration (ms): 213453.2 | throughput per GPU (TFLOP/s/GPU): 36.1 | learning rate: 2.312326E-06 | global batch size: 64 | lm loss: 6.905041E-01 | loss scale: 1.0 | grad norm: 2.642 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 03:10:41] iteration 546/ 1000 | consumed samples: 34944 | elapsed time per iteration (ms): 118083.8 | throughput per GPU (TFLOP/s/GPU): 65.3 | learning rate: 2.304430E-06 | global batch size: 64 | lm loss: 6.795663E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 03:13:12] iteration 547/ 1000 | consumed samples: 35008 | elapsed time per iteration (ms): 151065.5 | throughput per GPU (TFLOP/s/GPU): 51.0 | learning rate: 2.296536E-06 | global batch size: 64 | lm loss: 6.740454E-01 | loss scale: 1.0 | grad norm: 0.963 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-28 03:14:56] iteration 548/ 1000 | consumed samples: 35072 | elapsed time per iteration (ms): 103599.5 | throughput per GPU (TFLOP/s/GPU): 74.4 | learning rate: 2.288645E-06 | global batch size: 64 | lm loss: 6.875500E-01 | loss scale: 1.0 | grad norm: 0.919 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-28 03:17:11] iteration 549/ 1000 | consumed samples: 35136 | elapsed time per iteration (ms): 135405.8 | throughput per GPU (TFLOP/s/GPU): 56.9 | learning rate: 2.280757E-06 | global batch size: 64 | lm loss: 7.654210E-01 | loss scale: 1.0 | grad norm: 1.331 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-28 03:18:34] iteration 550/ 1000 | consumed samples: 35200 | elapsed time per iteration (ms): 83113.3 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 2.272871E-06 | global batch size: 64 | lm loss: 7.020198E-01 | loss scale: 1.0 | grad norm: 0.947 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 03:19:54] iteration 551/ 1000 | consumed samples: 35264 | elapsed time per iteration (ms): 79527.9 | throughput per GPU (TFLOP/s/GPU): 96.9 | learning rate: 2.264989E-06 | global batch size: 64 | lm loss: 7.161276E-01 | loss scale: 1.0 | grad norm: 1.478 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure + [2024-11-28 03:21:09] iteration 552/ 1000 | consumed samples: 35328 | elapsed time per iteration (ms): 75060.8 | throughput per GPU (TFLOP/s/GPU): 102.7 | learning rate: 2.257109E-06 | global batch size: 64 | lm loss: 6.593223E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 03:22:35] iteration 553/ 1000 | consumed samples: 35392 | elapsed time per iteration (ms): 86407.9 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 2.249233E-06 | global batch size: 64 | lm loss: 6.227847E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-28 03:23:53] iteration 554/ 1000 | consumed samples: 35456 | elapsed time per iteration (ms): 77190.3 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 2.241359E-06 | global batch size: 64 | lm loss: 6.859035E-01 | loss scale: 1.0 | grad norm: 9.515 | number of skipped iterations: 0 | number of nan iterations: 0 | +processed_samples 2000 unjoint_samples 2000 joint_samples 139 [104870, 75543] +processed_samples 2000 unjoint_samples 2000 joint_samples 134 [82659, 118848] +processed_samples 2000 unjoint_samples 2000 joint_samples 139 [104870, 75543] +processed_samples 2000 unjoint_samples 2000 joint_samples 134 [82659, 118848] +processed_samples 2000 unjoint_samples 2000 joint_samples 132 [78151, 115723] +processed_samples 2000 unjoint_samples 2000 joint_samples 132 [78151, 115723] +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +processed_samples 2000 unjoint_samples 2000 joint_samples 137 [102192, 79149] +processed_samples 2000 unjoint_samples 2000 joint_samples 137 [102192, 79149] + [2024-11-28 03:25:10] iteration 555/ 1000 | consumed samples: 35520 | elapsed time per iteration (ms): 76910.6 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.233489E-06 | global batch size: 64 | lm loss: 6.369921E-01 | loss scale: 1.0 | grad norm: 1.142 | number of skipped iterations: 0 | number of nan iterations: 0 | +processed_samples 2000 unjoint_samples 2000 joint_samples 146 [111400, 83335] +processed_samples 2000 unjoint_samples 2000 joint_samples 146 [111400, 83335] +processed_samples 2000 unjoint_samples 2000 joint_samples 134 [114848, 116982] +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +processed_samples 2000 unjoint_samples 2000 joint_samples 134 [114848, 116982] +processed_samples 2000 unjoint_samples 2000 joint_samples 146 [122504, 130548] +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +processed_samples 2000 unjoint_samples 2000 joint_samples 146 [122504, 130548] +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +processed_samples 2000 unjoint_samples 2000 joint_samples 143 [38121, 93882] +processed_samples 2000 unjoint_samples 2000 joint_samples 143 [38121, 93882] + [2024-11-28 03:26:30] iteration 556/ 1000 | consumed samples: 35584 | elapsed time per iteration (ms): 80902.6 | throughput per GPU (TFLOP/s/GPU): 95.3 | learning rate: 2.225622E-06 | global batch size: 64 | lm loss: 6.364502E-01 | loss scale: 1.0 | grad norm: 12.179 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure + [2024-11-28 03:27:50] iteration 557/ 1000 | consumed samples: 35648 | elapsed time per iteration (ms): 79243.5 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 2.217759E-06 | global batch size: 64 | lm loss: 6.272420E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-28 03:29:20] iteration 558/ 1000 | consumed samples: 35712 | elapsed time per iteration (ms): 90139.8 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 2.209899E-06 | global batch size: 64 | lm loss: 6.555175E-01 | loss scale: 1.0 | grad norm: 1.026 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-28 03:30:50] iteration 559/ 1000 | consumed samples: 35776 | elapsed time per iteration (ms): 89959.7 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 2.202043E-06 | global batch size: 64 | lm loss: 6.715569E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-28 03:32:19] iteration 560/ 1000 | consumed samples: 35840 | elapsed time per iteration (ms): 89186.0 | throughput per GPU (TFLOP/s/GPU): 86.4 | learning rate: 2.194190E-06 | global batch size: 64 | lm loss: 7.095491E-01 | loss scale: 1.0 | grad norm: 0.880 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-28 03:33:50] iteration 561/ 1000 | consumed samples: 35904 | elapsed time per iteration (ms): 91058.7 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 2.186341E-06 | global batch size: 64 | lm loss: 6.814739E-01 | loss scale: 1.0 | grad norm: 1.873 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 03:35:29] iteration 562/ 1000 | consumed samples: 35968 | elapsed time per iteration (ms): 99208.0 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 2.178496E-06 | global batch size: 64 | lm loss: 6.558337E-01 | loss scale: 1.0 | grad norm: 0.976 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 03:37:21] iteration 563/ 1000 | consumed samples: 36032 | elapsed time per iteration (ms): 111663.2 | throughput per GPU (TFLOP/s/GPU): 69.0 | learning rate: 2.170655E-06 | global batch size: 64 | lm loss: 7.521163E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 03:38:40] iteration 564/ 1000 | consumed samples: 36096 | elapsed time per iteration (ms): 78978.4 | throughput per GPU (TFLOP/s/GPU): 97.6 | learning rate: 2.162817E-06 | global batch size: 64 | lm loss: 6.506171E-01 | loss scale: 1.0 | grad norm: 0.910 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95c76fac0] mmco: unref short failure +[h264 @ 0x55d95c76fac0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95c76fac0] mmco: unref short failure + [2024-11-28 03:39:50] iteration 565/ 1000 | consumed samples: 36160 | elapsed time per iteration (ms): 70465.5 | throughput per GPU (TFLOP/s/GPU): 109.4 | learning rate: 2.154984E-06 | global batch size: 64 | lm loss: 6.828893E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 03:42:24] iteration 566/ 1000 | consumed samples: 36224 | elapsed time per iteration (ms): 153198.9 | throughput per GPU (TFLOP/s/GPU): 50.3 | learning rate: 2.147155E-06 | global batch size: 64 | lm loss: 6.904967E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-28 03:44:00] iteration 567/ 1000 | consumed samples: 36288 | elapsed time per iteration (ms): 95808.9 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 2.139330E-06 | global batch size: 64 | lm loss: 6.944407E-01 | loss scale: 1.0 | grad norm: 1.575 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure + [2024-11-28 03:45:30] iteration 568/ 1000 | consumed samples: 36352 | elapsed time per iteration (ms): 90076.5 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 2.131510E-06 | global batch size: 64 | lm loss: 6.663601E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 03:47:43] iteration 569/ 1000 | consumed samples: 36416 | elapsed time per iteration (ms): 133694.0 | throughput per GPU (TFLOP/s/GPU): 57.7 | learning rate: 2.123694E-06 | global batch size: 64 | lm loss: 6.749229E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure + [2024-11-28 03:49:26] iteration 570/ 1000 | consumed samples: 36480 | elapsed time per iteration (ms): 103054.1 | throughput per GPU (TFLOP/s/GPU): 74.8 | learning rate: 2.115882E-06 | global batch size: 64 | lm loss: 6.709623E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-28 03:50:43] iteration 571/ 1000 | consumed samples: 36544 | elapsed time per iteration (ms): 76619.1 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 2.108075E-06 | global batch size: 64 | lm loss: 6.776207E-01 | loss scale: 1.0 | grad norm: 0.946 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure + [2024-11-28 03:52:25] iteration 572/ 1000 | consumed samples: 36608 | elapsed time per iteration (ms): 101916.4 | throughput per GPU (TFLOP/s/GPU): 75.6 | learning rate: 2.100273E-06 | global batch size: 64 | lm loss: 6.679552E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 03:53:45] iteration 573/ 1000 | consumed samples: 36672 | elapsed time per iteration (ms): 79866.7 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 2.092475E-06 | global batch size: 64 | lm loss: 6.936182E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure + [2024-11-28 03:55:21] iteration 574/ 1000 | consumed samples: 36736 | elapsed time per iteration (ms): 96071.2 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 2.084682E-06 | global batch size: 64 | lm loss: 6.289401E-01 | loss scale: 1.0 | grad norm: 1.045 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure + [2024-11-28 03:56:43] iteration 575/ 1000 | consumed samples: 36800 | elapsed time per iteration (ms): 82515.1 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 2.076894E-06 | global batch size: 64 | lm loss: 6.630487E-01 | loss scale: 1.0 | grad norm: 0.803 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 03:58:24] iteration 576/ 1000 | consumed samples: 36864 | elapsed time per iteration (ms): 100449.7 | throughput per GPU (TFLOP/s/GPU): 76.7 | learning rate: 2.069111E-06 | global batch size: 64 | lm loss: 7.089751E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-28 03:59:46] iteration 577/ 1000 | consumed samples: 36928 | elapsed time per iteration (ms): 81737.9 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 2.061333E-06 | global batch size: 64 | lm loss: 6.779444E-01 | loss scale: 1.0 | grad norm: 0.950 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure + [2024-11-28 04:01:12] iteration 578/ 1000 | consumed samples: 36992 | elapsed time per iteration (ms): 85889.6 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 2.053560E-06 | global batch size: 64 | lm loss: 6.224477E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555deee52400] Missing reference picture, default is 65530 +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] Missing reference picture, default is 65530 +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] Missing reference picture, default is 65530 +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] Missing reference picture, default is 65530 +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-28 04:02:38] iteration 579/ 1000 | consumed samples: 37056 | elapsed time per iteration (ms): 86490.3 | throughput per GPU (TFLOP/s/GPU): 89.1 | learning rate: 2.045792E-06 | global batch size: 64 | lm loss: 7.127038E-01 | loss scale: 1.0 | grad norm: 1.609 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 04:03:48] iteration 580/ 1000 | consumed samples: 37120 | elapsed time per iteration (ms): 69500.1 | throughput per GPU (TFLOP/s/GPU): 110.9 | learning rate: 2.038029E-06 | global batch size: 64 | lm loss: 6.738371E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555de1b1b9c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 04:04:54] iteration 581/ 1000 | consumed samples: 37184 | elapsed time per iteration (ms): 66004.8 | throughput per GPU (TFLOP/s/GPU): 116.8 | learning rate: 2.030272E-06 | global batch size: 64 | lm loss: 7.272795E-01 | loss scale: 1.0 | grad norm: 1.192 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 04:06:33] iteration 582/ 1000 | consumed samples: 37248 | elapsed time per iteration (ms): 99201.2 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 2.022521E-06 | global batch size: 64 | lm loss: 7.001043E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 04:07:49] iteration 583/ 1000 | consumed samples: 37312 | elapsed time per iteration (ms): 76158.1 | throughput per GPU (TFLOP/s/GPU): 101.2 | learning rate: 2.014775E-06 | global batch size: 64 | lm loss: 6.584710E-01 | loss scale: 1.0 | grad norm: 0.949 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 04:09:29] iteration 584/ 1000 | consumed samples: 37376 | elapsed time per iteration (ms): 100293.9 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 2.007034E-06 | global batch size: 64 | lm loss: 7.019646E-01 | loss scale: 1.0 | grad norm: 1.057 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda7a5c0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure + [2024-11-28 04:11:02] iteration 585/ 1000 | consumed samples: 37440 | elapsed time per iteration (ms): 92320.3 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 1.999299E-06 | global batch size: 64 | lm loss: 6.701733E-01 | loss scale: 1.0 | grad norm: 1.212 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 04:12:17] iteration 586/ 1000 | consumed samples: 37504 | elapsed time per iteration (ms): 75791.0 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 1.991570E-06 | global batch size: 64 | lm loss: 6.158227E-01 | loss scale: 1.0 | grad norm: 0.814 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 04:14:01] iteration 587/ 1000 | consumed samples: 37568 | elapsed time per iteration (ms): 103921.2 | throughput per GPU (TFLOP/s/GPU): 74.2 | learning rate: 1.983847E-06 | global batch size: 64 | lm loss: 7.956023E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 04:15:38] iteration 588/ 1000 | consumed samples: 37632 | elapsed time per iteration (ms): 96704.1 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 1.976130E-06 | global batch size: 64 | lm loss: 6.003830E-01 | loss scale: 1.0 | grad norm: 0.940 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955e77900] mmco: unref short failure +[h264 @ 0x55d955e77900] mmco: unref short failure + [2024-11-28 04:16:57] iteration 589/ 1000 | consumed samples: 37696 | elapsed time per iteration (ms): 78610.3 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 1.968419E-06 | global batch size: 64 | lm loss: 6.652268E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e77900] mmco: unref short failure +[h264 @ 0x55d955e77900] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 04:18:16] iteration 590/ 1000 | consumed samples: 37760 | elapsed time per iteration (ms): 79304.7 | throughput per GPU (TFLOP/s/GPU): 97.2 | learning rate: 1.960714E-06 | global batch size: 64 | lm loss: 6.871901E-01 | loss scale: 1.0 | grad norm: 0.875 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 04:19:46] iteration 591/ 1000 | consumed samples: 37824 | elapsed time per iteration (ms): 89563.2 | throughput per GPU (TFLOP/s/GPU): 86.1 | learning rate: 1.953015E-06 | global batch size: 64 | lm loss: 6.808087E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure + [2024-11-28 04:21:19] iteration 592/ 1000 | consumed samples: 37888 | elapsed time per iteration (ms): 93821.8 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 1.945322E-06 | global batch size: 64 | lm loss: 6.263063E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 04:22:30] iteration 593/ 1000 | consumed samples: 37952 | elapsed time per iteration (ms): 70646.2 | throughput per GPU (TFLOP/s/GPU): 109.1 | learning rate: 1.937636E-06 | global batch size: 64 | lm loss: 6.241408E-01 | loss scale: 1.0 | grad norm: 0.808 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 04:23:56] iteration 594/ 1000 | consumed samples: 38016 | elapsed time per iteration (ms): 85648.4 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 1.929956E-06 | global batch size: 64 | lm loss: 7.639532E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 04:25:13] iteration 595/ 1000 | consumed samples: 38080 | elapsed time per iteration (ms): 76932.8 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 1.922283E-06 | global batch size: 64 | lm loss: 6.555119E-01 | loss scale: 1.0 | grad norm: 1.175 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 04:26:52] iteration 596/ 1000 | consumed samples: 38144 | elapsed time per iteration (ms): 99139.1 | throughput per GPU (TFLOP/s/GPU): 77.8 | learning rate: 1.914616E-06 | global batch size: 64 | lm loss: 6.388863E-01 | loss scale: 1.0 | grad norm: 0.967 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure + [2024-11-28 04:28:04] iteration 597/ 1000 | consumed samples: 38208 | elapsed time per iteration (ms): 72491.0 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 1.906956E-06 | global batch size: 64 | lm loss: 6.704025E-01 | loss scale: 1.0 | grad norm: 1.546 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 04:29:25] iteration 598/ 1000 | consumed samples: 38272 | elapsed time per iteration (ms): 80478.1 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 1.899303E-06 | global batch size: 64 | lm loss: 6.136689E-01 | loss scale: 1.0 | grad norm: 1.109 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure + [2024-11-28 04:31:01] iteration 599/ 1000 | consumed samples: 38336 | elapsed time per iteration (ms): 96696.7 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 1.891656E-06 | global batch size: 64 | lm loss: 6.841081E-01 | loss scale: 1.0 | grad norm: 0.911 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 04:32:13] iteration 600/ 1000 | consumed samples: 38400 | elapsed time per iteration (ms): 71923.7 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 1.884016E-06 | global batch size: 64 | lm loss: 6.775546E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (145404.08, 145404.33) +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure + [2024-11-28 04:35:55] iteration 601/ 1000 | consumed samples: 38464 | elapsed time per iteration (ms): 76132.0 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 1.876384E-06 | global batch size: 64 | lm loss: 6.671367E-01 | loss scale: 1.0 | grad norm: 1.008 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 04:37:59] iteration 602/ 1000 | consumed samples: 38528 | elapsed time per iteration (ms): 124385.1 | throughput per GPU (TFLOP/s/GPU): 62.0 | learning rate: 1.868758E-06 | global batch size: 64 | lm loss: 6.553986E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 04:39:27] iteration 603/ 1000 | consumed samples: 38592 | elapsed time per iteration (ms): 87787.7 | throughput per GPU (TFLOP/s/GPU): 87.8 | learning rate: 1.861140E-06 | global batch size: 64 | lm loss: 6.798328E-01 | loss scale: 1.0 | grad norm: 0.921 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-28 04:41:44] iteration 604/ 1000 | consumed samples: 38656 | elapsed time per iteration (ms): 137126.1 | throughput per GPU (TFLOP/s/GPU): 56.2 | learning rate: 1.853528E-06 | global batch size: 64 | lm loss: 6.614301E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 04:43:04] iteration 605/ 1000 | consumed samples: 38720 | elapsed time per iteration (ms): 80051.9 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 1.845925E-06 | global batch size: 64 | lm loss: 6.650500E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure + [2024-11-28 04:44:41] iteration 606/ 1000 | consumed samples: 38784 | elapsed time per iteration (ms): 96758.0 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 1.838328E-06 | global batch size: 64 | lm loss: 6.374074E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 04:45:56] iteration 607/ 1000 | consumed samples: 38848 | elapsed time per iteration (ms): 74822.7 | throughput per GPU (TFLOP/s/GPU): 103.0 | learning rate: 1.830739E-06 | global batch size: 64 | lm loss: 6.324731E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure + [2024-11-28 04:47:20] iteration 608/ 1000 | consumed samples: 38912 | elapsed time per iteration (ms): 83983.6 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 1.823157E-06 | global batch size: 64 | lm loss: 7.577994E-01 | loss scale: 1.0 | grad norm: 0.919 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 04:48:36] iteration 609/ 1000 | consumed samples: 38976 | elapsed time per iteration (ms): 75702.5 | throughput per GPU (TFLOP/s/GPU): 101.8 | learning rate: 1.815584E-06 | global batch size: 64 | lm loss: 7.271475E-01 | loss scale: 1.0 | grad norm: 0.900 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure + [2024-11-28 04:49:48] iteration 610/ 1000 | consumed samples: 39040 | elapsed time per iteration (ms): 71831.5 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 1.808017E-06 | global batch size: 64 | lm loss: 7.437881E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] Missing reference picture, default is 65540 +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] Missing reference picture, default is 65540 +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] Missing reference picture, default is 65540 +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] Missing reference picture, default is 65540 +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-28 04:51:43] iteration 611/ 1000 | consumed samples: 39104 | elapsed time per iteration (ms): 115234.0 | throughput per GPU (TFLOP/s/GPU): 66.9 | learning rate: 1.800459E-06 | global batch size: 64 | lm loss: 6.608888E-01 | loss scale: 1.0 | grad norm: 0.997 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 04:52:55] iteration 612/ 1000 | consumed samples: 39168 | elapsed time per iteration (ms): 72263.2 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 1.792908E-06 | global batch size: 64 | lm loss: 6.323757E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 04:54:19] iteration 613/ 1000 | consumed samples: 39232 | elapsed time per iteration (ms): 84413.8 | throughput per GPU (TFLOP/s/GPU): 91.3 | learning rate: 1.785366E-06 | global batch size: 64 | lm loss: 6.873047E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure + [2024-11-28 04:56:25] iteration 614/ 1000 | consumed samples: 39296 | elapsed time per iteration (ms): 125544.5 | throughput per GPU (TFLOP/s/GPU): 61.4 | learning rate: 1.777831E-06 | global batch size: 64 | lm loss: 6.434957E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 04:57:56] iteration 615/ 1000 | consumed samples: 39360 | elapsed time per iteration (ms): 91010.5 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 1.770305E-06 | global batch size: 64 | lm loss: 6.286437E-01 | loss scale: 1.0 | grad norm: 0.812 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-28 04:59:23] iteration 616/ 1000 | consumed samples: 39424 | elapsed time per iteration (ms): 87353.2 | throughput per GPU (TFLOP/s/GPU): 88.2 | learning rate: 1.762786E-06 | global batch size: 64 | lm loss: 6.583841E-01 | loss scale: 1.0 | grad norm: 0.871 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 05:00:51] iteration 617/ 1000 | consumed samples: 39488 | elapsed time per iteration (ms): 87235.7 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 1.755276E-06 | global batch size: 64 | lm loss: 6.181954E-01 | loss scale: 1.0 | grad norm: 0.806 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-28 05:02:07] iteration 618/ 1000 | consumed samples: 39552 | elapsed time per iteration (ms): 75907.6 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 1.747775E-06 | global batch size: 64 | lm loss: 6.557152E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure + [2024-11-28 05:03:45] iteration 619/ 1000 | consumed samples: 39616 | elapsed time per iteration (ms): 98071.4 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 1.740281E-06 | global batch size: 64 | lm loss: 7.434057E-01 | loss scale: 1.0 | grad norm: 1.612 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure + [2024-11-28 05:05:04] iteration 620/ 1000 | consumed samples: 39680 | elapsed time per iteration (ms): 78880.8 | throughput per GPU (TFLOP/s/GPU): 97.7 | learning rate: 1.732797E-06 | global batch size: 64 | lm loss: 6.678324E-01 | loss scale: 1.0 | grad norm: 1.186 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 05:06:53] iteration 621/ 1000 | consumed samples: 39744 | elapsed time per iteration (ms): 109711.4 | throughput per GPU (TFLOP/s/GPU): 70.3 | learning rate: 1.725320E-06 | global batch size: 64 | lm loss: 6.870947E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 05:08:28] iteration 622/ 1000 | consumed samples: 39808 | elapsed time per iteration (ms): 94990.9 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 1.717853E-06 | global batch size: 64 | lm loss: 6.423949E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 05:10:16] iteration 623/ 1000 | consumed samples: 39872 | elapsed time per iteration (ms): 107279.7 | throughput per GPU (TFLOP/s/GPU): 71.9 | learning rate: 1.710394E-06 | global batch size: 64 | lm loss: 6.864921E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 05:11:34] iteration 624/ 1000 | consumed samples: 39936 | elapsed time per iteration (ms): 78100.2 | throughput per GPU (TFLOP/s/GPU): 98.7 | learning rate: 1.702944E-06 | global batch size: 64 | lm loss: 6.161958E-01 | loss scale: 1.0 | grad norm: 0.932 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 05:12:52] iteration 625/ 1000 | consumed samples: 40000 | elapsed time per iteration (ms): 78301.6 | throughput per GPU (TFLOP/s/GPU): 98.4 | learning rate: 1.695503E-06 | global batch size: 64 | lm loss: 6.460192E-01 | loss scale: 1.0 | grad norm: 2.826 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 05:14:10] iteration 626/ 1000 | consumed samples: 40064 | elapsed time per iteration (ms): 78201.6 | throughput per GPU (TFLOP/s/GPU): 98.6 | learning rate: 1.688070E-06 | global batch size: 64 | lm loss: 6.443557E-01 | loss scale: 1.0 | grad norm: 0.823 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 05:15:44] iteration 627/ 1000 | consumed samples: 40128 | elapsed time per iteration (ms): 93670.8 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 1.680647E-06 | global batch size: 64 | lm loss: 6.387815E-01 | loss scale: 1.0 | grad norm: 0.940 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x555dee948bc0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 05:17:20] iteration 628/ 1000 | consumed samples: 40192 | elapsed time per iteration (ms): 96292.0 | throughput per GPU (TFLOP/s/GPU): 80.1 | learning rate: 1.673233E-06 | global batch size: 64 | lm loss: 6.631010E-01 | loss scale: 1.0 | grad norm: 0.909 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 05:18:48] iteration 629/ 1000 | consumed samples: 40256 | elapsed time per iteration (ms): 88126.4 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 1.665828E-06 | global batch size: 64 | lm loss: 6.977863E-01 | loss scale: 1.0 | grad norm: 1.025 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 05:20:05] iteration 630/ 1000 | consumed samples: 40320 | elapsed time per iteration (ms): 76528.1 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 1.658433E-06 | global batch size: 64 | lm loss: 6.857378E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 05:21:10] iteration 631/ 1000 | consumed samples: 40384 | elapsed time per iteration (ms): 65241.8 | throughput per GPU (TFLOP/s/GPU): 118.2 | learning rate: 1.651047E-06 | global batch size: 64 | lm loss: 6.398815E-01 | loss scale: 1.0 | grad norm: 1.143 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 05:22:28] iteration 632/ 1000 | consumed samples: 40448 | elapsed time per iteration (ms): 77900.5 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 1.643670E-06 | global batch size: 64 | lm loss: 6.859981E-01 | loss scale: 1.0 | grad norm: 0.871 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 05:23:53] iteration 633/ 1000 | consumed samples: 40512 | elapsed time per iteration (ms): 84697.6 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 1.636303E-06 | global batch size: 64 | lm loss: 8.127314E-01 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure + [2024-11-28 05:25:42] iteration 634/ 1000 | consumed samples: 40576 | elapsed time per iteration (ms): 109558.1 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 1.628945E-06 | global batch size: 64 | lm loss: 7.018932E-01 | loss scale: 1.0 | grad norm: 1.332 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure + [2024-11-28 05:27:01] iteration 635/ 1000 | consumed samples: 40640 | elapsed time per iteration (ms): 78317.2 | throughput per GPU (TFLOP/s/GPU): 98.4 | learning rate: 1.621597E-06 | global batch size: 64 | lm loss: 6.609097E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 05:28:21] iteration 636/ 1000 | consumed samples: 40704 | elapsed time per iteration (ms): 79989.3 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 1.614259E-06 | global batch size: 64 | lm loss: 6.400303E-01 | loss scale: 1.0 | grad norm: 0.997 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-28 05:29:30] iteration 637/ 1000 | consumed samples: 40768 | elapsed time per iteration (ms): 69397.2 | throughput per GPU (TFLOP/s/GPU): 111.1 | learning rate: 1.606930E-06 | global batch size: 64 | lm loss: 6.713878E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 05:31:01] iteration 638/ 1000 | consumed samples: 40832 | elapsed time per iteration (ms): 90607.7 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 1.599612E-06 | global batch size: 64 | lm loss: 6.384090E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956b52f40] mmco: unref short failure +[h264 @ 0x55d956b52f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 05:32:23] iteration 639/ 1000 | consumed samples: 40896 | elapsed time per iteration (ms): 82721.4 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 1.592303E-06 | global batch size: 64 | lm loss: 6.337037E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 05:33:53] iteration 640/ 1000 | consumed samples: 40960 | elapsed time per iteration (ms): 89577.6 | throughput per GPU (TFLOP/s/GPU): 86.1 | learning rate: 1.585004E-06 | global batch size: 64 | lm loss: 6.542257E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec64f880] mmco: unref short failure +[h264 @ 0x555dec64f880] mmco: unref short failure +[h264 @ 0x555dec64f880] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 05:35:43] iteration 641/ 1000 | consumed samples: 41024 | elapsed time per iteration (ms): 110295.5 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 1.577716E-06 | global batch size: 64 | lm loss: 6.256745E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 05:37:04] iteration 642/ 1000 | consumed samples: 41088 | elapsed time per iteration (ms): 80229.3 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 1.570438E-06 | global batch size: 64 | lm loss: 7.508308E-01 | loss scale: 1.0 | grad norm: 1.025 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure + [2024-11-28 05:38:14] iteration 643/ 1000 | consumed samples: 41152 | elapsed time per iteration (ms): 70229.7 | throughput per GPU (TFLOP/s/GPU): 109.8 | learning rate: 1.563170E-06 | global batch size: 64 | lm loss: 6.440120E-01 | loss scale: 1.0 | grad norm: 1.269 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure + [2024-11-28 05:39:52] iteration 644/ 1000 | consumed samples: 41216 | elapsed time per iteration (ms): 98058.6 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 1.555912E-06 | global batch size: 64 | lm loss: 7.841610E-01 | loss scale: 1.0 | grad norm: 1.065 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure + [2024-11-28 05:41:15] iteration 645/ 1000 | consumed samples: 41280 | elapsed time per iteration (ms): 82685.7 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 1.548665E-06 | global batch size: 64 | lm loss: 6.619388E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 05:42:54] iteration 646/ 1000 | consumed samples: 41344 | elapsed time per iteration (ms): 99243.9 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 1.541428E-06 | global batch size: 64 | lm loss: 6.424022E-01 | loss scale: 1.0 | grad norm: 1.142 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 05:44:18] iteration 647/ 1000 | consumed samples: 41408 | elapsed time per iteration (ms): 84013.8 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 1.534202E-06 | global batch size: 64 | lm loss: 6.809196E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-28 05:45:31] iteration 648/ 1000 | consumed samples: 41472 | elapsed time per iteration (ms): 73201.4 | throughput per GPU (TFLOP/s/GPU): 105.3 | learning rate: 1.526987E-06 | global batch size: 64 | lm loss: 6.052883E-01 | loss scale: 1.0 | grad norm: 1.352 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-28 05:48:26] iteration 649/ 1000 | consumed samples: 41536 | elapsed time per iteration (ms): 174738.1 | throughput per GPU (TFLOP/s/GPU): 44.1 | learning rate: 1.519782E-06 | global batch size: 64 | lm loss: 6.482288E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 05:49:37] iteration 650/ 1000 | consumed samples: 41600 | elapsed time per iteration (ms): 71547.4 | throughput per GPU (TFLOP/s/GPU): 107.7 | learning rate: 1.512588E-06 | global batch size: 64 | lm loss: 7.025335E-01 | loss scale: 1.0 | grad norm: 0.772 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 05:51:05] iteration 651/ 1000 | consumed samples: 41664 | elapsed time per iteration (ms): 87819.5 | throughput per GPU (TFLOP/s/GPU): 87.8 | learning rate: 1.505405E-06 | global batch size: 64 | lm loss: 6.501545E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 05:52:25] iteration 652/ 1000 | consumed samples: 41728 | elapsed time per iteration (ms): 80258.0 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 1.498233E-06 | global batch size: 64 | lm loss: 6.332477E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 05:53:44] iteration 653/ 1000 | consumed samples: 41792 | elapsed time per iteration (ms): 78288.0 | throughput per GPU (TFLOP/s/GPU): 98.5 | learning rate: 1.491072E-06 | global batch size: 64 | lm loss: 6.316616E-01 | loss scale: 1.0 | grad norm: 1.037 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 05:54:57] iteration 654/ 1000 | consumed samples: 41856 | elapsed time per iteration (ms): 73018.5 | throughput per GPU (TFLOP/s/GPU): 105.6 | learning rate: 1.483922E-06 | global batch size: 64 | lm loss: 6.351939E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 05:56:27] iteration 655/ 1000 | consumed samples: 41920 | elapsed time per iteration (ms): 90523.6 | throughput per GPU (TFLOP/s/GPU): 85.2 | learning rate: 1.476783E-06 | global batch size: 64 | lm loss: 7.357122E-01 | loss scale: 1.0 | grad norm: 0.958 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 05:58:09] iteration 656/ 1000 | consumed samples: 41984 | elapsed time per iteration (ms): 101719.0 | throughput per GPU (TFLOP/s/GPU): 75.8 | learning rate: 1.469656E-06 | global batch size: 64 | lm loss: 6.373831E-01 | loss scale: 1.0 | grad norm: 0.758 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9566796c0] mmco: unref short failure +[h264 @ 0x55d9566796c0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure + [2024-11-28 05:59:31] iteration 657/ 1000 | consumed samples: 42048 | elapsed time per iteration (ms): 81789.4 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 1.462540E-06 | global batch size: 64 | lm loss: 7.237002E-01 | loss scale: 1.0 | grad norm: 1.506 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 06:00:56] iteration 658/ 1000 | consumed samples: 42112 | elapsed time per iteration (ms): 85319.2 | throughput per GPU (TFLOP/s/GPU): 90.3 | learning rate: 1.455435E-06 | global batch size: 64 | lm loss: 8.265692E-01 | loss scale: 1.0 | grad norm: 0.875 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-28 06:02:07] iteration 659/ 1000 | consumed samples: 42176 | elapsed time per iteration (ms): 70835.9 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 1.448341E-06 | global batch size: 64 | lm loss: 6.352268E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure + [2024-11-28 06:03:38] iteration 660/ 1000 | consumed samples: 42240 | elapsed time per iteration (ms): 90985.5 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 1.441260E-06 | global batch size: 64 | lm loss: 6.936147E-01 | loss scale: 1.0 | grad norm: 1.083 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f97780] mmco: unref short failure +[h264 @ 0x55d957f97780] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f97780] mmco: unref short failure +[h264 @ 0x55d957f97780] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f97780] mmco: unref short failure +[h264 @ 0x55d957f97780] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure + [2024-11-28 06:05:10] iteration 661/ 1000 | consumed samples: 42304 | elapsed time per iteration (ms): 91563.5 | throughput per GPU (TFLOP/s/GPU): 84.2 | learning rate: 1.434190E-06 | global batch size: 64 | lm loss: 6.521118E-01 | loss scale: 1.0 | grad norm: 0.942 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 06:06:19] iteration 662/ 1000 | consumed samples: 42368 | elapsed time per iteration (ms): 69403.8 | throughput per GPU (TFLOP/s/GPU): 111.1 | learning rate: 1.427131E-06 | global batch size: 64 | lm loss: 6.527104E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d957f98640] mmco: unref short failure +[h264 @ 0x55d957f98640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-28 06:07:55] iteration 663/ 1000 | consumed samples: 42432 | elapsed time per iteration (ms): 95706.3 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 1.420085E-06 | global batch size: 64 | lm loss: 6.427971E-01 | loss scale: 1.0 | grad norm: 0.742 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deebbd8c0] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 06:10:29] iteration 664/ 1000 | consumed samples: 42496 | elapsed time per iteration (ms): 154602.0 | throughput per GPU (TFLOP/s/GPU): 49.9 | learning rate: 1.413050E-06 | global batch size: 64 | lm loss: 6.893479E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 06:12:11] iteration 665/ 1000 | consumed samples: 42560 | elapsed time per iteration (ms): 101420.4 | throughput per GPU (TFLOP/s/GPU): 76.0 | learning rate: 1.406027E-06 | global batch size: 64 | lm loss: 7.379445E-01 | loss scale: 1.0 | grad norm: 0.808 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-28 06:13:32] iteration 666/ 1000 | consumed samples: 42624 | elapsed time per iteration (ms): 81326.1 | throughput per GPU (TFLOP/s/GPU): 94.8 | learning rate: 1.399016E-06 | global batch size: 64 | lm loss: 6.550295E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure + [2024-11-28 06:15:36] iteration 667/ 1000 | consumed samples: 42688 | elapsed time per iteration (ms): 123552.3 | throughput per GPU (TFLOP/s/GPU): 62.4 | learning rate: 1.392018E-06 | global batch size: 64 | lm loss: 7.040758E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-28 06:17:52] iteration 668/ 1000 | consumed samples: 42752 | elapsed time per iteration (ms): 136390.8 | throughput per GPU (TFLOP/s/GPU): 56.5 | learning rate: 1.385031E-06 | global batch size: 64 | lm loss: 6.094788E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure + [2024-11-28 06:19:23] iteration 669/ 1000 | consumed samples: 42816 | elapsed time per iteration (ms): 90558.5 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 1.378057E-06 | global batch size: 64 | lm loss: 6.080279E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 06:20:37] iteration 670/ 1000 | consumed samples: 42880 | elapsed time per iteration (ms): 74584.3 | throughput per GPU (TFLOP/s/GPU): 103.4 | learning rate: 1.371094E-06 | global batch size: 64 | lm loss: 6.635122E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-28 06:22:11] iteration 671/ 1000 | consumed samples: 42944 | elapsed time per iteration (ms): 93345.0 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 1.364145E-06 | global batch size: 64 | lm loss: 6.329452E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure + [2024-11-28 06:23:45] iteration 672/ 1000 | consumed samples: 43008 | elapsed time per iteration (ms): 93954.5 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 1.357207E-06 | global batch size: 64 | lm loss: 6.652647E-01 | loss scale: 1.0 | grad norm: 1.427 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530 +[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530 +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] Missing reference picture, default is 65530 +[h264 @ 0x55d95873d100] Missing reference picture, default is 65530 +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530 +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530 +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x555dec6aab00] mmco: unref short failure +[h264 @ 0x55d95873d100] Missing reference picture, default is 65530 +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] Missing reference picture, default is 65530 +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-28 06:25:12] iteration 673/ 1000 | consumed samples: 43072 | elapsed time per iteration (ms): 87676.7 | throughput per GPU (TFLOP/s/GPU): 87.9 | learning rate: 1.350283E-06 | global batch size: 64 | lm loss: 6.648919E-01 | loss scale: 1.0 | grad norm: 0.909 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x555ded1fff00] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 06:28:07] iteration 674/ 1000 | consumed samples: 43136 | elapsed time per iteration (ms): 175164.1 | throughput per GPU (TFLOP/s/GPU): 44.0 | learning rate: 1.343370E-06 | global batch size: 64 | lm loss: 6.360693E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 06:29:28] iteration 675/ 1000 | consumed samples: 43200 | elapsed time per iteration (ms): 80818.5 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 1.336471E-06 | global batch size: 64 | lm loss: 6.220077E-01 | loss scale: 1.0 | grad norm: 0.859 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 06:31:06] iteration 676/ 1000 | consumed samples: 43264 | elapsed time per iteration (ms): 97698.5 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 1.329584E-06 | global batch size: 64 | lm loss: 7.096108E-01 | loss scale: 1.0 | grad norm: 1.075 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 06:32:31] iteration 677/ 1000 | consumed samples: 43328 | elapsed time per iteration (ms): 84755.2 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 1.322710E-06 | global batch size: 64 | lm loss: 6.861854E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 06:33:44] iteration 678/ 1000 | consumed samples: 43392 | elapsed time per iteration (ms): 73412.7 | throughput per GPU (TFLOP/s/GPU): 105.0 | learning rate: 1.315849E-06 | global batch size: 64 | lm loss: 6.372385E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 06:35:14] iteration 679/ 1000 | consumed samples: 43456 | elapsed time per iteration (ms): 90165.8 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 1.309001E-06 | global batch size: 64 | lm loss: 7.036251E-01 | loss scale: 1.0 | grad norm: 1.326 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-28 06:36:53] iteration 680/ 1000 | consumed samples: 43520 | elapsed time per iteration (ms): 98651.2 | throughput per GPU (TFLOP/s/GPU): 78.1 | learning rate: 1.302166E-06 | global batch size: 64 | lm loss: 6.906869E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-28 06:38:45] iteration 681/ 1000 | consumed samples: 43584 | elapsed time per iteration (ms): 112237.4 | throughput per GPU (TFLOP/s/GPU): 68.7 | learning rate: 1.295344E-06 | global batch size: 64 | lm loss: 6.447147E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 06:40:02] iteration 682/ 1000 | consumed samples: 43648 | elapsed time per iteration (ms): 76368.4 | throughput per GPU (TFLOP/s/GPU): 100.9 | learning rate: 1.288535E-06 | global batch size: 64 | lm loss: 6.220345E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 06:41:12] iteration 683/ 1000 | consumed samples: 43712 | elapsed time per iteration (ms): 69860.0 | throughput per GPU (TFLOP/s/GPU): 110.3 | learning rate: 1.281739E-06 | global batch size: 64 | lm loss: 6.551774E-01 | loss scale: 1.0 | grad norm: 0.941 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure + [2024-11-28 06:42:27] iteration 684/ 1000 | consumed samples: 43776 | elapsed time per iteration (ms): 75777.7 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 1.274957E-06 | global batch size: 64 | lm loss: 6.513221E-01 | loss scale: 1.0 | grad norm: 0.857 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 06:43:43] iteration 685/ 1000 | consumed samples: 43840 | elapsed time per iteration (ms): 76097.4 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 1.268188E-06 | global batch size: 64 | lm loss: 6.644771E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 06:45:07] iteration 686/ 1000 | consumed samples: 43904 | elapsed time per iteration (ms): 83094.6 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 1.261432E-06 | global batch size: 64 | lm loss: 6.695704E-01 | loss scale: 1.0 | grad norm: 0.789 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure + [2024-11-28 06:46:36] iteration 687/ 1000 | consumed samples: 43968 | elapsed time per iteration (ms): 89651.0 | throughput per GPU (TFLOP/s/GPU): 86.0 | learning rate: 1.254690E-06 | global batch size: 64 | lm loss: 6.823508E-01 | loss scale: 1.0 | grad norm: 1.004 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 06:48:00] iteration 688/ 1000 | consumed samples: 44032 | elapsed time per iteration (ms): 83603.2 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 1.247961E-06 | global batch size: 64 | lm loss: 6.911366E-01 | loss scale: 1.0 | grad norm: 1.534 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 06:49:24] iteration 689/ 1000 | consumed samples: 44096 | elapsed time per iteration (ms): 84296.1 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 1.241247E-06 | global batch size: 64 | lm loss: 6.808408E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 06:50:49] iteration 690/ 1000 | consumed samples: 44160 | elapsed time per iteration (ms): 85310.0 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 1.234546E-06 | global batch size: 64 | lm loss: 6.851189E-01 | loss scale: 1.0 | grad norm: 0.967 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded624e00] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure + [2024-11-28 06:52:15] iteration 691/ 1000 | consumed samples: 44224 | elapsed time per iteration (ms): 85594.1 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 1.227858E-06 | global batch size: 64 | lm loss: 6.419601E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 06:53:57] iteration 692/ 1000 | consumed samples: 44288 | elapsed time per iteration (ms): 102236.7 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 1.221185E-06 | global batch size: 64 | lm loss: 6.681433E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-28 06:55:54] iteration 693/ 1000 | consumed samples: 44352 | elapsed time per iteration (ms): 116496.4 | throughput per GPU (TFLOP/s/GPU): 66.2 | learning rate: 1.214525E-06 | global batch size: 64 | lm loss: 6.374875E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 06:57:15] iteration 694/ 1000 | consumed samples: 44416 | elapsed time per iteration (ms): 81088.3 | throughput per GPU (TFLOP/s/GPU): 95.1 | learning rate: 1.207880E-06 | global batch size: 64 | lm loss: 6.311159E-01 | loss scale: 1.0 | grad norm: 1.018 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure + [2024-11-28 06:58:26] iteration 695/ 1000 | consumed samples: 44480 | elapsed time per iteration (ms): 71111.0 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 1.201249E-06 | global batch size: 64 | lm loss: 6.946983E-01 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +processed_samples 2500 unjoint_samples 2500 joint_samples 168 [120290, 108487] +processed_samples 2500 unjoint_samples 2500 joint_samples 173 [88757, 63354] +processed_samples 2500 unjoint_samples 2500 joint_samples 169 [110246, 71865] +processed_samples 2500 unjoint_samples 2500 joint_samples 180 [102941, 84465] +processed_samples 2500 unjoint_samples 2500 joint_samples 174 [125182, 14057] +processed_samples 2500 unjoint_samples 2500 joint_samples 171 [96609, 73106] +processed_samples 2500 unjoint_samples 2500 joint_samples 178 [97503, 125668] +processed_samples 2500 unjoint_samples 2500 joint_samples 168 [120290, 108487] +processed_samples 2500 unjoint_samples 2500 joint_samples 173 [88757, 63354] +processed_samples 2500 unjoint_samples 2500 joint_samples 169 [110246, 71865] +processed_samples 2500 unjoint_samples 2500 joint_samples 180 [102941, 84465] +processed_samples 2500 unjoint_samples 2500 joint_samples 174 [125182, 14057] +processed_samples 2500 unjoint_samples 2500 joint_samples 171 [96609, 73106] +processed_samples 2500 unjoint_samples 2500 joint_samples 178 [97503, 125668] +processed_samples 2500 unjoint_samples 2500 joint_samples 179 [119274, 75578] +processed_samples 2500 unjoint_samples 2500 joint_samples 179 [119274, 75578] + [2024-11-28 06:59:58] iteration 696/ 1000 | consumed samples: 44544 | elapsed time per iteration (ms): 92412.2 | throughput per GPU (TFLOP/s/GPU): 83.4 | learning rate: 1.194631E-06 | global batch size: 64 | lm loss: 6.888507E-01 | loss scale: 1.0 | grad norm: 1.813 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-28 07:01:23] iteration 697/ 1000 | consumed samples: 44608 | elapsed time per iteration (ms): 84033.4 | throughput per GPU (TFLOP/s/GPU): 91.7 | learning rate: 1.188028E-06 | global batch size: 64 | lm loss: 6.656145E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-28 07:02:45] iteration 698/ 1000 | consumed samples: 44672 | elapsed time per iteration (ms): 82325.8 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 1.181440E-06 | global batch size: 64 | lm loss: 6.377969E-01 | loss scale: 1.0 | grad norm: 0.987 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 07:04:21] iteration 699/ 1000 | consumed samples: 44736 | elapsed time per iteration (ms): 96343.7 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 1.174865E-06 | global batch size: 64 | lm loss: 6.097525E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-28 07:05:35] iteration 700/ 1000 | consumed samples: 44800 | elapsed time per iteration (ms): 73619.4 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 1.168305E-06 | global batch size: 64 | lm loss: 5.898384E-01 | loss scale: 1.0 | grad norm: 0.878 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (266265.05, 266265.44) + [2024-11-28 07:11:23] iteration 701/ 1000 | consumed samples: 44864 | elapsed time per iteration (ms): 82012.4 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 1.161760E-06 | global batch size: 64 | lm loss: 7.252907E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-28 07:12:57] iteration 702/ 1000 | consumed samples: 44928 | elapsed time per iteration (ms): 93619.3 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 1.155229E-06 | global batch size: 64 | lm loss: 6.619033E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-28 07:15:00] iteration 703/ 1000 | consumed samples: 44992 | elapsed time per iteration (ms): 122806.3 | throughput per GPU (TFLOP/s/GPU): 62.8 | learning rate: 1.148713E-06 | global batch size: 64 | lm loss: 6.486115E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-28 07:16:23] iteration 704/ 1000 | consumed samples: 45056 | elapsed time per iteration (ms): 83280.2 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 1.142211E-06 | global batch size: 64 | lm loss: 6.365772E-01 | loss scale: 1.0 | grad norm: 1.116 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df1590d80] mmco: unref short failure +[h264 @ 0x555df1590d80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 07:18:33] iteration 705/ 1000 | consumed samples: 45120 | elapsed time per iteration (ms): 130359.9 | throughput per GPU (TFLOP/s/GPU): 59.1 | learning rate: 1.135724E-06 | global batch size: 64 | lm loss: 6.714696E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 07:20:11] iteration 706/ 1000 | consumed samples: 45184 | elapsed time per iteration (ms): 97891.1 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 1.129252E-06 | global batch size: 64 | lm loss: 7.067757E-01 | loss scale: 1.0 | grad norm: 0.749 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure + [2024-11-28 07:21:25] iteration 707/ 1000 | consumed samples: 45248 | elapsed time per iteration (ms): 73520.2 | throughput per GPU (TFLOP/s/GPU): 104.8 | learning rate: 1.122795E-06 | global batch size: 64 | lm loss: 6.495814E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d959a05200] mmco: unref short failure + [2024-11-28 07:22:54] iteration 708/ 1000 | consumed samples: 45312 | elapsed time per iteration (ms): 88974.9 | throughput per GPU (TFLOP/s/GPU): 86.6 | learning rate: 1.116353E-06 | global batch size: 64 | lm loss: 6.196807E-01 | loss scale: 1.0 | grad norm: 0.756 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 07:24:42] iteration 709/ 1000 | consumed samples: 45376 | elapsed time per iteration (ms): 108553.4 | throughput per GPU (TFLOP/s/GPU): 71.0 | learning rate: 1.109926E-06 | global batch size: 64 | lm loss: 6.966307E-01 | loss scale: 1.0 | grad norm: 1.342 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-28 07:26:25] iteration 710/ 1000 | consumed samples: 45440 | elapsed time per iteration (ms): 102631.6 | throughput per GPU (TFLOP/s/GPU): 75.1 | learning rate: 1.103514E-06 | global batch size: 64 | lm loss: 6.217573E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 07:27:56] iteration 711/ 1000 | consumed samples: 45504 | elapsed time per iteration (ms): 91479.5 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 1.097117E-06 | global batch size: 64 | lm loss: 6.903121E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-28 07:29:15] iteration 712/ 1000 | consumed samples: 45568 | elapsed time per iteration (ms): 78703.9 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 1.090736E-06 | global batch size: 64 | lm loss: 7.672721E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 07:30:49] iteration 713/ 1000 | consumed samples: 45632 | elapsed time per iteration (ms): 94197.3 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 1.084370E-06 | global batch size: 64 | lm loss: 6.627736E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-28 07:32:21] iteration 714/ 1000 | consumed samples: 45696 | elapsed time per iteration (ms): 91448.9 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 1.078019E-06 | global batch size: 64 | lm loss: 6.277137E-01 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 07:35:48] iteration 715/ 1000 | consumed samples: 45760 | elapsed time per iteration (ms): 207227.5 | throughput per GPU (TFLOP/s/GPU): 37.2 | learning rate: 1.071683E-06 | global batch size: 64 | lm loss: 6.213148E-01 | loss scale: 1.0 | grad norm: 0.898 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 07:37:11] iteration 716/ 1000 | consumed samples: 45824 | elapsed time per iteration (ms): 82861.5 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 1.065363E-06 | global batch size: 64 | lm loss: 6.508120E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 07:38:56] iteration 717/ 1000 | consumed samples: 45888 | elapsed time per iteration (ms): 104658.6 | throughput per GPU (TFLOP/s/GPU): 73.7 | learning rate: 1.059059E-06 | global batch size: 64 | lm loss: 6.287351E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555ded63c840] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded0cd540] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure + [2024-11-28 07:40:46] iteration 718/ 1000 | consumed samples: 45952 | elapsed time per iteration (ms): 110547.3 | throughput per GPU (TFLOP/s/GPU): 69.7 | learning rate: 1.052770E-06 | global batch size: 64 | lm loss: 6.484494E-01 | loss scale: 1.0 | grad norm: 1.095 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure + [2024-11-28 07:41:57] iteration 719/ 1000 | consumed samples: 46016 | elapsed time per iteration (ms): 71255.5 | throughput per GPU (TFLOP/s/GPU): 108.2 | learning rate: 1.046497E-06 | global batch size: 64 | lm loss: 6.334097E-01 | loss scale: 1.0 | grad norm: 0.807 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 07:43:07] iteration 720/ 1000 | consumed samples: 46080 | elapsed time per iteration (ms): 69978.1 | throughput per GPU (TFLOP/s/GPU): 110.2 | learning rate: 1.040240E-06 | global batch size: 64 | lm loss: 6.164656E-01 | loss scale: 1.0 | grad norm: 1.219 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 07:44:15] iteration 721/ 1000 | consumed samples: 46144 | elapsed time per iteration (ms): 68071.8 | throughput per GPU (TFLOP/s/GPU): 113.2 | learning rate: 1.033999E-06 | global batch size: 64 | lm loss: 6.665082E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure + [2024-11-28 07:45:53] iteration 722/ 1000 | consumed samples: 46208 | elapsed time per iteration (ms): 97636.9 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 1.027773E-06 | global batch size: 64 | lm loss: 7.176014E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 07:47:09] iteration 723/ 1000 | consumed samples: 46272 | elapsed time per iteration (ms): 75952.6 | throughput per GPU (TFLOP/s/GPU): 101.5 | learning rate: 1.021564E-06 | global batch size: 64 | lm loss: 7.123392E-01 | loss scale: 1.0 | grad norm: 1.054 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec794000] mmco: unref short failure +[h264 @ 0x555dec794000] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95692cb80] mmco: unref short failure +[h264 @ 0x55d95692cb80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-28 07:48:34] iteration 724/ 1000 | consumed samples: 46336 | elapsed time per iteration (ms): 85202.5 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.015370E-06 | global batch size: 64 | lm loss: 6.468613E-01 | loss scale: 1.0 | grad norm: 0.700 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 07:50:06] iteration 725/ 1000 | consumed samples: 46400 | elapsed time per iteration (ms): 91867.5 | throughput per GPU (TFLOP/s/GPU): 83.9 | learning rate: 1.009193E-06 | global batch size: 64 | lm loss: 6.567154E-01 | loss scale: 1.0 | grad norm: 0.925 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 07:51:49] iteration 726/ 1000 | consumed samples: 46464 | elapsed time per iteration (ms): 103112.7 | throughput per GPU (TFLOP/s/GPU): 74.8 | learning rate: 1.003032E-06 | global batch size: 64 | lm loss: 6.286631E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-28 07:53:26] iteration 727/ 1000 | consumed samples: 46528 | elapsed time per iteration (ms): 96307.4 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 9.968868E-07 | global batch size: 64 | lm loss: 5.987281E-01 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 07:54:48] iteration 728/ 1000 | consumed samples: 46592 | elapsed time per iteration (ms): 81878.8 | throughput per GPU (TFLOP/s/GPU): 94.1 | learning rate: 9.907581E-07 | global batch size: 64 | lm loss: 6.290482E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 07:56:38] iteration 729/ 1000 | consumed samples: 46656 | elapsed time per iteration (ms): 109994.2 | throughput per GPU (TFLOP/s/GPU): 70.1 | learning rate: 9.846458E-07 | global batch size: 64 | lm loss: 5.959722E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure + [2024-11-28 07:58:01] iteration 730/ 1000 | consumed samples: 46720 | elapsed time per iteration (ms): 83517.2 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 9.785499E-07 | global batch size: 64 | lm loss: 6.537108E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 07:59:13] iteration 731/ 1000 | consumed samples: 46784 | elapsed time per iteration (ms): 71873.9 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 9.724704E-07 | global batch size: 64 | lm loss: 6.193046E-01 | loss scale: 1.0 | grad norm: 0.769 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 08:00:44] iteration 732/ 1000 | consumed samples: 46848 | elapsed time per iteration (ms): 91514.7 | throughput per GPU (TFLOP/s/GPU): 84.2 | learning rate: 9.664075E-07 | global batch size: 64 | lm loss: 6.407279E-01 | loss scale: 1.0 | grad norm: 1.185 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 08:02:07] iteration 733/ 1000 | consumed samples: 46912 | elapsed time per iteration (ms): 82064.7 | throughput per GPU (TFLOP/s/GPU): 93.9 | learning rate: 9.603612E-07 | global batch size: 64 | lm loss: 6.269274E-01 | loss scale: 1.0 | grad norm: 0.800 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-28 08:03:40] iteration 734/ 1000 | consumed samples: 46976 | elapsed time per iteration (ms): 93712.0 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 9.543316E-07 | global batch size: 64 | lm loss: 5.805084E-01 | loss scale: 1.0 | grad norm: 0.878 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-28 08:05:36] iteration 735/ 1000 | consumed samples: 47040 | elapsed time per iteration (ms): 116175.3 | throughput per GPU (TFLOP/s/GPU): 66.4 | learning rate: 9.483188E-07 | global batch size: 64 | lm loss: 6.738163E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x555ded0fd000] mmco: unref short failure + [2024-11-28 08:06:55] iteration 736/ 1000 | consumed samples: 47104 | elapsed time per iteration (ms): 78504.1 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 9.423227E-07 | global batch size: 64 | lm loss: 6.742063E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 08:08:16] iteration 737/ 1000 | consumed samples: 47168 | elapsed time per iteration (ms): 80642.9 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 9.363435E-07 | global batch size: 64 | lm loss: 6.592349E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-28 08:09:46] iteration 738/ 1000 | consumed samples: 47232 | elapsed time per iteration (ms): 90072.0 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 9.303812E-07 | global batch size: 64 | lm loss: 6.529440E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 08:11:16] iteration 739/ 1000 | consumed samples: 47296 | elapsed time per iteration (ms): 89972.7 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 9.244359E-07 | global batch size: 64 | lm loss: 6.460364E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 08:12:50] iteration 740/ 1000 | consumed samples: 47360 | elapsed time per iteration (ms): 94367.3 | throughput per GPU (TFLOP/s/GPU): 81.7 | learning rate: 9.185077E-07 | global batch size: 64 | lm loss: 7.104200E-01 | loss scale: 1.0 | grad norm: 3.827 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-28 08:14:11] iteration 741/ 1000 | consumed samples: 47424 | elapsed time per iteration (ms): 81259.1 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 9.125966E-07 | global batch size: 64 | lm loss: 7.529110E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 08:15:45] iteration 742/ 1000 | consumed samples: 47488 | elapsed time per iteration (ms): 93219.8 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 9.067026E-07 | global batch size: 64 | lm loss: 6.652069E-01 | loss scale: 1.0 | grad norm: 1.478 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 08:17:38] iteration 743/ 1000 | consumed samples: 47552 | elapsed time per iteration (ms): 113535.7 | throughput per GPU (TFLOP/s/GPU): 67.9 | learning rate: 9.008259E-07 | global batch size: 64 | lm loss: 6.635025E-01 | loss scale: 1.0 | grad norm: 1.056 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure + [2024-11-28 08:19:14] iteration 744/ 1000 | consumed samples: 47616 | elapsed time per iteration (ms): 95812.0 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 8.949665E-07 | global batch size: 64 | lm loss: 6.736272E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x55d9575fd4c0] mmco: unref short failure +[h264 @ 0x555dedf05880] Missing reference picture, default is 65530 +[h264 @ 0x555dedf05880] Missing reference picture, default is 65530 +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] Missing reference picture, default is 65530 +[h264 @ 0x555dedf05880] Missing reference picture, default is 65530 +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530 +[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530 +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530 +[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530 +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 08:20:29] iteration 745/ 1000 | consumed samples: 47680 | elapsed time per iteration (ms): 75295.9 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 8.891245E-07 | global batch size: 64 | lm loss: 6.412493E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-28 08:21:53] iteration 746/ 1000 | consumed samples: 47744 | elapsed time per iteration (ms): 83674.2 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 8.832998E-07 | global batch size: 64 | lm loss: 6.288611E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 08:23:13] iteration 747/ 1000 | consumed samples: 47808 | elapsed time per iteration (ms): 79617.4 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 8.774927E-07 | global batch size: 64 | lm loss: 6.212032E-01 | loss scale: 1.0 | grad norm: 0.787 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x55d95c2eb200] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure + [2024-11-28 08:24:28] iteration 748/ 1000 | consumed samples: 47872 | elapsed time per iteration (ms): 75213.3 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 8.717031E-07 | global batch size: 64 | lm loss: 6.660864E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d957a91640] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure + [2024-11-28 08:26:48] iteration 749/ 1000 | consumed samples: 47936 | elapsed time per iteration (ms): 140548.6 | throughput per GPU (TFLOP/s/GPU): 54.8 | learning rate: 8.659311E-07 | global batch size: 64 | lm loss: 7.050755E-01 | loss scale: 1.0 | grad norm: 0.807 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 08:28:15] iteration 750/ 1000 | consumed samples: 48000 | elapsed time per iteration (ms): 86971.5 | throughput per GPU (TFLOP/s/GPU): 88.6 | learning rate: 8.601767E-07 | global batch size: 64 | lm loss: 6.688051E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 08:29:58] iteration 751/ 1000 | consumed samples: 48064 | elapsed time per iteration (ms): 102811.2 | throughput per GPU (TFLOP/s/GPU): 75.0 | learning rate: 8.544401E-07 | global batch size: 64 | lm loss: 6.897563E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 08:31:19] iteration 752/ 1000 | consumed samples: 48128 | elapsed time per iteration (ms): 80807.5 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 8.487213E-07 | global batch size: 64 | lm loss: 6.634867E-01 | loss scale: 1.0 | grad norm: 1.802 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 08:32:48] iteration 753/ 1000 | consumed samples: 48192 | elapsed time per iteration (ms): 89405.0 | throughput per GPU (TFLOP/s/GPU): 86.2 | learning rate: 8.430203E-07 | global batch size: 64 | lm loss: 7.193509E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure + [2024-11-28 08:35:03] iteration 754/ 1000 | consumed samples: 48256 | elapsed time per iteration (ms): 134288.4 | throughput per GPU (TFLOP/s/GPU): 57.4 | learning rate: 8.373373E-07 | global batch size: 64 | lm loss: 7.454703E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 08:36:27] iteration 755/ 1000 | consumed samples: 48320 | elapsed time per iteration (ms): 84129.7 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 8.316722E-07 | global batch size: 64 | lm loss: 6.674823E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 08:38:34] iteration 756/ 1000 | consumed samples: 48384 | elapsed time per iteration (ms): 127023.5 | throughput per GPU (TFLOP/s/GPU): 60.7 | learning rate: 8.260251E-07 | global batch size: 64 | lm loss: 6.470302E-01 | loss scale: 1.0 | grad norm: 1.098 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure + [2024-11-28 08:40:07] iteration 757/ 1000 | consumed samples: 48448 | elapsed time per iteration (ms): 92857.7 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 8.203961E-07 | global batch size: 64 | lm loss: 6.956238E-01 | loss scale: 1.0 | grad norm: 0.936 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x555dede22280] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 08:41:32] iteration 758/ 1000 | consumed samples: 48512 | elapsed time per iteration (ms): 85650.1 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 8.147852E-07 | global batch size: 64 | lm loss: 6.154037E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure + [2024-11-28 08:42:45] iteration 759/ 1000 | consumed samples: 48576 | elapsed time per iteration (ms): 72871.9 | throughput per GPU (TFLOP/s/GPU): 105.8 | learning rate: 8.091926E-07 | global batch size: 64 | lm loss: 6.568743E-01 | loss scale: 1.0 | grad norm: 1.079 | number of skipped iterations: 0 | number of nan iterations: 0 | +Token indices sequence length is longer than the specified maximum sequence length for this model (137911 > 131072). Running this sequence through the model will result in indexing errors +Token indices sequence length is longer than the specified maximum sequence length for this model (137911 > 131072). Running this sequence through the model will result in indexing errors +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955cbb4c0] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 08:44:09] iteration 760/ 1000 | consumed samples: 48640 | elapsed time per iteration (ms): 83541.6 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 8.036182E-07 | global batch size: 64 | lm loss: 6.524685E-01 | loss scale: 1.0 | grad norm: 0.898 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-28 08:45:32] iteration 761/ 1000 | consumed samples: 48704 | elapsed time per iteration (ms): 83282.4 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 7.980621E-07 | global batch size: 64 | lm loss: 7.427295E-01 | loss scale: 1.0 | grad norm: 1.038 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-28 08:47:10] iteration 762/ 1000 | consumed samples: 48768 | elapsed time per iteration (ms): 97565.2 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 7.925244E-07 | global batch size: 64 | lm loss: 6.557251E-01 | loss scale: 1.0 | grad norm: 1.083 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure + [2024-11-28 08:49:03] iteration 763/ 1000 | consumed samples: 48832 | elapsed time per iteration (ms): 112877.4 | throughput per GPU (TFLOP/s/GPU): 68.3 | learning rate: 7.870051E-07 | global batch size: 64 | lm loss: 7.390345E-01 | loss scale: 1.0 | grad norm: 0.930 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure + [2024-11-28 08:50:22] iteration 764/ 1000 | consumed samples: 48896 | elapsed time per iteration (ms): 79888.6 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 7.815044E-07 | global batch size: 64 | lm loss: 5.950512E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 08:52:05] iteration 765/ 1000 | consumed samples: 48960 | elapsed time per iteration (ms): 102480.9 | throughput per GPU (TFLOP/s/GPU): 75.2 | learning rate: 7.760222E-07 | global batch size: 64 | lm loss: 6.820820E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 08:53:24] iteration 766/ 1000 | consumed samples: 49024 | elapsed time per iteration (ms): 79224.7 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 7.705586E-07 | global batch size: 64 | lm loss: 7.155175E-01 | loss scale: 1.0 | grad norm: 1.681 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 08:54:44] iteration 767/ 1000 | consumed samples: 49088 | elapsed time per iteration (ms): 80096.9 | throughput per GPU (TFLOP/s/GPU): 96.2 | learning rate: 7.651136E-07 | global batch size: 64 | lm loss: 7.093288E-01 | loss scale: 1.0 | grad norm: 1.001 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure + [2024-11-28 08:57:45] iteration 768/ 1000 | consumed samples: 49152 | elapsed time per iteration (ms): 180236.9 | throughput per GPU (TFLOP/s/GPU): 42.8 | learning rate: 7.596874E-07 | global batch size: 64 | lm loss: 6.092068E-01 | loss scale: 1.0 | grad norm: 0.760 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 08:59:23] iteration 769/ 1000 | consumed samples: 49216 | elapsed time per iteration (ms): 98856.7 | throughput per GPU (TFLOP/s/GPU): 78.0 | learning rate: 7.542799E-07 | global batch size: 64 | lm loss: 6.536492E-01 | loss scale: 1.0 | grad norm: 0.797 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 09:01:05] iteration 770/ 1000 | consumed samples: 49280 | elapsed time per iteration (ms): 101480.6 | throughput per GPU (TFLOP/s/GPU): 76.0 | learning rate: 7.488913E-07 | global batch size: 64 | lm loss: 6.581574E-01 | loss scale: 1.0 | grad norm: 0.962 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-28 09:02:15] iteration 771/ 1000 | consumed samples: 49344 | elapsed time per iteration (ms): 70333.8 | throughput per GPU (TFLOP/s/GPU): 109.6 | learning rate: 7.435216E-07 | global batch size: 64 | lm loss: 6.496270E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 09:03:58] iteration 772/ 1000 | consumed samples: 49408 | elapsed time per iteration (ms): 102719.8 | throughput per GPU (TFLOP/s/GPU): 75.0 | learning rate: 7.381709E-07 | global batch size: 64 | lm loss: 6.505972E-01 | loss scale: 1.0 | grad norm: 0.976 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dec9973c0] [h264 @ 0x55d9574d90c0] mmco: unref short failure +mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 09:05:42] iteration 773/ 1000 | consumed samples: 49472 | elapsed time per iteration (ms): 104214.6 | throughput per GPU (TFLOP/s/GPU): 74.0 | learning rate: 7.328391E-07 | global batch size: 64 | lm loss: 6.121303E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-28 09:07:11] iteration 774/ 1000 | consumed samples: 49536 | elapsed time per iteration (ms): 88647.4 | throughput per GPU (TFLOP/s/GPU): 87.0 | learning rate: 7.275264E-07 | global batch size: 64 | lm loss: 6.867324E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure + [2024-11-28 09:08:24] iteration 775/ 1000 | consumed samples: 49600 | elapsed time per iteration (ms): 73074.2 | throughput per GPU (TFLOP/s/GPU): 105.5 | learning rate: 7.222328E-07 | global batch size: 64 | lm loss: 6.488966E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x55d957a1cec0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 09:10:06] iteration 776/ 1000 | consumed samples: 49664 | elapsed time per iteration (ms): 102250.9 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 7.169584E-07 | global batch size: 64 | lm loss: 6.472382E-01 | loss scale: 1.0 | grad norm: 1.003 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 09:11:37] iteration 777/ 1000 | consumed samples: 49728 | elapsed time per iteration (ms): 90431.0 | throughput per GPU (TFLOP/s/GPU): 85.2 | learning rate: 7.117032E-07 | global batch size: 64 | lm loss: 7.486657E-01 | loss scale: 1.0 | grad norm: 1.617 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure + [2024-11-28 09:13:11] iteration 778/ 1000 | consumed samples: 49792 | elapsed time per iteration (ms): 94071.2 | throughput per GPU (TFLOP/s/GPU): 81.9 | learning rate: 7.064673E-07 | global batch size: 64 | lm loss: 6.917417E-01 | loss scale: 1.0 | grad norm: 1.739 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 09:14:34] iteration 779/ 1000 | consumed samples: 49856 | elapsed time per iteration (ms): 83680.9 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 7.012508E-07 | global batch size: 64 | lm loss: 6.594355E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 09:16:00] iteration 780/ 1000 | consumed samples: 49920 | elapsed time per iteration (ms): 85537.9 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 6.960536E-07 | global batch size: 64 | lm loss: 6.973290E-01 | loss scale: 1.0 | grad norm: 1.559 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure + [2024-11-28 09:17:25] iteration 781/ 1000 | consumed samples: 49984 | elapsed time per iteration (ms): 84911.3 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 6.908759E-07 | global batch size: 64 | lm loss: 6.615264E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d9574d90c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure + [2024-11-28 09:19:07] iteration 782/ 1000 | consumed samples: 50048 | elapsed time per iteration (ms): 101571.3 | throughput per GPU (TFLOP/s/GPU): 75.9 | learning rate: 6.857177E-07 | global batch size: 64 | lm loss: 6.533257E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 09:20:28] iteration 783/ 1000 | consumed samples: 50112 | elapsed time per iteration (ms): 81688.9 | throughput per GPU (TFLOP/s/GPU): 94.4 | learning rate: 6.805790E-07 | global batch size: 64 | lm loss: 6.432073E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 09:22:00] iteration 784/ 1000 | consumed samples: 50176 | elapsed time per iteration (ms): 91262.4 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 6.754599E-07 | global batch size: 64 | lm loss: 7.190090E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 09:23:47] iteration 785/ 1000 | consumed samples: 50240 | elapsed time per iteration (ms): 107395.2 | throughput per GPU (TFLOP/s/GPU): 71.8 | learning rate: 6.703605E-07 | global batch size: 64 | lm loss: 6.706704E-01 | loss scale: 1.0 | grad norm: 0.855 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 09:25:05] iteration 786/ 1000 | consumed samples: 50304 | elapsed time per iteration (ms): 78147.3 | throughput per GPU (TFLOP/s/GPU): 98.6 | learning rate: 6.652809E-07 | global batch size: 64 | lm loss: 7.410614E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure + [2024-11-28 09:26:31] iteration 787/ 1000 | consumed samples: 50368 | elapsed time per iteration (ms): 85945.0 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 6.602210E-07 | global batch size: 64 | lm loss: 6.612120E-01 | loss scale: 1.0 | grad norm: 1.331 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure + [2024-11-28 09:27:49] iteration 788/ 1000 | consumed samples: 50432 | elapsed time per iteration (ms): 77740.8 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 6.551809E-07 | global batch size: 64 | lm loss: 7.221023E-01 | loss scale: 1.0 | grad norm: 0.928 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 09:29:00] iteration 789/ 1000 | consumed samples: 50496 | elapsed time per iteration (ms): 71128.4 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 6.501607E-07 | global batch size: 64 | lm loss: 7.365326E-01 | loss scale: 1.0 | grad norm: 1.031 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure + [2024-11-28 09:30:29] iteration 790/ 1000 | consumed samples: 50560 | elapsed time per iteration (ms): 89353.3 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 6.451604E-07 | global batch size: 64 | lm loss: 6.473880E-01 | loss scale: 1.0 | grad norm: 0.865 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure + [2024-11-28 09:31:45] iteration 791/ 1000 | consumed samples: 50624 | elapsed time per iteration (ms): 75220.3 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 6.401801E-07 | global batch size: 64 | lm loss: 6.595445E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95a292180] mmco: unref short failure + [2024-11-28 09:33:20] iteration 792/ 1000 | consumed samples: 50688 | elapsed time per iteration (ms): 95120.8 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 6.352198E-07 | global batch size: 64 | lm loss: 6.879074E-01 | loss scale: 1.0 | grad norm: 0.819 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure + [2024-11-28 09:34:50] iteration 793/ 1000 | consumed samples: 50752 | elapsed time per iteration (ms): 90072.9 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 6.302797E-07 | global batch size: 64 | lm loss: 5.921465E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 09:36:20] iteration 794/ 1000 | consumed samples: 50816 | elapsed time per iteration (ms): 89985.7 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 6.253596E-07 | global batch size: 64 | lm loss: 7.055615E-01 | loss scale: 1.0 | grad norm: 0.736 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure + [2024-11-28 09:38:08] iteration 795/ 1000 | consumed samples: 50880 | elapsed time per iteration (ms): 107875.5 | throughput per GPU (TFLOP/s/GPU): 71.5 | learning rate: 6.204598E-07 | global batch size: 64 | lm loss: 6.044904E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure + [2024-11-28 09:39:48] iteration 796/ 1000 | consumed samples: 50944 | elapsed time per iteration (ms): 100652.8 | throughput per GPU (TFLOP/s/GPU): 76.6 | learning rate: 6.155801E-07 | global batch size: 64 | lm loss: 6.538367E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure +[h264 @ 0x55d956f98f80] mmco: unref short failure + [2024-11-28 09:41:02] iteration 797/ 1000 | consumed samples: 51008 | elapsed time per iteration (ms): 73916.1 | throughput per GPU (TFLOP/s/GPU): 104.3 | learning rate: 6.107208E-07 | global batch size: 64 | lm loss: 6.171812E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 09:42:24] iteration 798/ 1000 | consumed samples: 51072 | elapsed time per iteration (ms): 81723.2 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 6.058818E-07 | global batch size: 64 | lm loss: 6.591862E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure + [2024-11-28 09:43:44] iteration 799/ 1000 | consumed samples: 51136 | elapsed time per iteration (ms): 79627.9 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 6.010633E-07 | global batch size: 64 | lm loss: 6.704946E-01 | loss scale: 1.0 | grad norm: 1.459 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-28 09:44:55] iteration 800/ 1000 | consumed samples: 51200 | elapsed time per iteration (ms): 70975.0 | throughput per GPU (TFLOP/s/GPU): 108.6 | learning rate: 5.962651E-07 | global batch size: 64 | lm loss: 7.106262E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (154771.64, 154771.93) +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 09:48:58] iteration 801/ 1000 | consumed samples: 51264 | elapsed time per iteration (ms): 88671.1 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 5.914875E-07 | global batch size: 64 | lm loss: 7.269337E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure + [2024-11-28 09:50:27] iteration 802/ 1000 | consumed samples: 51328 | elapsed time per iteration (ms): 88770.2 | throughput per GPU (TFLOP/s/GPU): 86.8 | learning rate: 5.867304E-07 | global batch size: 64 | lm loss: 6.636480E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure + [2024-11-28 09:52:17] iteration 803/ 1000 | consumed samples: 51392 | elapsed time per iteration (ms): 109923.6 | throughput per GPU (TFLOP/s/GPU): 70.1 | learning rate: 5.819938E-07 | global batch size: 64 | lm loss: 7.249713E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure + [2024-11-28 09:53:48] iteration 804/ 1000 | consumed samples: 51456 | elapsed time per iteration (ms): 91267.2 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 5.772780E-07 | global batch size: 64 | lm loss: 6.352175E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure + [2024-11-28 09:55:04] iteration 805/ 1000 | consumed samples: 51520 | elapsed time per iteration (ms): 76083.1 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 5.725828E-07 | global batch size: 64 | lm loss: 6.475290E-01 | loss scale: 1.0 | grad norm: 0.971 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure + [2024-11-28 09:56:38] iteration 806/ 1000 | consumed samples: 51584 | elapsed time per iteration (ms): 94181.1 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 5.679084E-07 | global batch size: 64 | lm loss: 7.037023E-01 | loss scale: 1.0 | grad norm: 1.058 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x555dece38b80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure + [2024-11-28 09:57:48] iteration 807/ 1000 | consumed samples: 51648 | elapsed time per iteration (ms): 69970.7 | throughput per GPU (TFLOP/s/GPU): 110.2 | learning rate: 5.632547E-07 | global batch size: 64 | lm loss: 6.819202E-01 | loss scale: 1.0 | grad norm: 1.416 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 09:59:06] iteration 808/ 1000 | consumed samples: 51712 | elapsed time per iteration (ms): 77503.9 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 5.586219E-07 | global batch size: 64 | lm loss: 6.660787E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-28 10:01:01] iteration 809/ 1000 | consumed samples: 51776 | elapsed time per iteration (ms): 115416.3 | throughput per GPU (TFLOP/s/GPU): 66.8 | learning rate: 5.540100E-07 | global batch size: 64 | lm loss: 6.367390E-01 | loss scale: 1.0 | grad norm: 0.986 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-28 10:02:37] iteration 810/ 1000 | consumed samples: 51840 | elapsed time per iteration (ms): 95696.8 | throughput per GPU (TFLOP/s/GPU): 80.6 | learning rate: 5.494190E-07 | global batch size: 64 | lm loss: 7.794733E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 10:04:12] iteration 811/ 1000 | consumed samples: 51904 | elapsed time per iteration (ms): 94858.7 | throughput per GPU (TFLOP/s/GPU): 81.3 | learning rate: 5.448490E-07 | global batch size: 64 | lm loss: 6.537660E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-28 10:05:33] iteration 812/ 1000 | consumed samples: 51968 | elapsed time per iteration (ms): 81180.0 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 5.403001E-07 | global batch size: 64 | lm loss: 6.626501E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-28 10:06:58] iteration 813/ 1000 | consumed samples: 52032 | elapsed time per iteration (ms): 84762.2 | throughput per GPU (TFLOP/s/GPU): 90.9 | learning rate: 5.357722E-07 | global batch size: 64 | lm loss: 6.666785E-01 | loss scale: 1.0 | grad norm: 3.573 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 10:08:18] iteration 814/ 1000 | consumed samples: 52096 | elapsed time per iteration (ms): 79762.7 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 5.312654E-07 | global batch size: 64 | lm loss: 6.747634E-01 | loss scale: 1.0 | grad norm: 0.833 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 10:09:25] iteration 815/ 1000 | consumed samples: 52160 | elapsed time per iteration (ms): 67840.9 | throughput per GPU (TFLOP/s/GPU): 113.6 | learning rate: 5.267799E-07 | global batch size: 64 | lm loss: 6.878562E-01 | loss scale: 1.0 | grad norm: 1.202 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure + [2024-11-28 10:10:59] iteration 816/ 1000 | consumed samples: 52224 | elapsed time per iteration (ms): 93363.9 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 5.223155E-07 | global batch size: 64 | lm loss: 6.529773E-01 | loss scale: 1.0 | grad norm: 1.001 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dee792700] mmco: unref short failure + [2024-11-28 10:12:22] iteration 817/ 1000 | consumed samples: 52288 | elapsed time per iteration (ms): 83678.3 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 5.178724E-07 | global batch size: 64 | lm loss: 7.590072E-01 | loss scale: 1.0 | grad norm: 0.930 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure + [2024-11-28 10:13:31] iteration 818/ 1000 | consumed samples: 52352 | elapsed time per iteration (ms): 68109.4 | throughput per GPU (TFLOP/s/GPU): 113.2 | learning rate: 5.134507E-07 | global batch size: 64 | lm loss: 7.248871E-01 | loss scale: 1.0 | grad norm: 0.877 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-28 10:15:02] iteration 819/ 1000 | consumed samples: 52416 | elapsed time per iteration (ms): 91126.7 | throughput per GPU (TFLOP/s/GPU): 84.6 | learning rate: 5.090503E-07 | global batch size: 64 | lm loss: 6.618973E-01 | loss scale: 1.0 | grad norm: 0.951 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 10:16:25] iteration 820/ 1000 | consumed samples: 52480 | elapsed time per iteration (ms): 83615.0 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 5.046713E-07 | global batch size: 64 | lm loss: 6.375130E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 10:18:24] iteration 821/ 1000 | consumed samples: 52544 | elapsed time per iteration (ms): 118406.0 | throughput per GPU (TFLOP/s/GPU): 65.1 | learning rate: 5.003137E-07 | global batch size: 64 | lm loss: 6.340280E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 10:19:49] iteration 822/ 1000 | consumed samples: 52608 | elapsed time per iteration (ms): 85555.0 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 4.959777E-07 | global batch size: 64 | lm loss: 6.423296E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 10:21:05] iteration 823/ 1000 | consumed samples: 52672 | elapsed time per iteration (ms): 75694.4 | throughput per GPU (TFLOP/s/GPU): 101.8 | learning rate: 4.916632E-07 | global batch size: 64 | lm loss: 6.706414E-01 | loss scale: 1.0 | grad norm: 0.728 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 10:22:34] iteration 824/ 1000 | consumed samples: 52736 | elapsed time per iteration (ms): 88878.5 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.873703E-07 | global batch size: 64 | lm loss: 6.234448E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 10:23:49] iteration 825/ 1000 | consumed samples: 52800 | elapsed time per iteration (ms): 75189.4 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 4.830990E-07 | global batch size: 64 | lm loss: 6.897532E-01 | loss scale: 1.0 | grad norm: 0.922 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure + [2024-11-28 10:25:16] iteration 826/ 1000 | consumed samples: 52864 | elapsed time per iteration (ms): 87247.9 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 4.788494E-07 | global batch size: 64 | lm loss: 7.539995E-01 | loss scale: 1.0 | grad norm: 0.943 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 10:26:25] iteration 827/ 1000 | consumed samples: 52928 | elapsed time per iteration (ms): 68636.7 | throughput per GPU (TFLOP/s/GPU): 112.3 | learning rate: 4.746216E-07 | global batch size: 64 | lm loss: 5.952159E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure + [2024-11-28 10:28:45] iteration 828/ 1000 | consumed samples: 52992 | elapsed time per iteration (ms): 140003.7 | throughput per GPU (TFLOP/s/GPU): 55.1 | learning rate: 4.704155E-07 | global batch size: 64 | lm loss: 6.752132E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x555defa21280] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure +[h264 @ 0x55d956fbbd80] mmco: unref short failure + [2024-11-28 10:30:20] iteration 829/ 1000 | consumed samples: 53056 | elapsed time per iteration (ms): 95171.1 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 4.662312E-07 | global batch size: 64 | lm loss: 6.471357E-01 | loss scale: 1.0 | grad norm: 0.823 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956f537c0] mmco: unref short failure + [2024-11-28 10:31:46] iteration 830/ 1000 | consumed samples: 53120 | elapsed time per iteration (ms): 85230.2 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.620688E-07 | global batch size: 64 | lm loss: 5.844744E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations: 0 | number of nan iterations: 0 | +processed_samples 3000 unjoint_samples 3000 joint_samples 205 [127422, 25206] +processed_samples 3000 unjoint_samples 3000 joint_samples 205 [127422, 25206] +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +processed_samples 3000 unjoint_samples 3000 joint_samples 210 [115831, 29036] +processed_samples 3000 unjoint_samples 3000 joint_samples 210 [115831, 29036] +processed_samples 3000 unjoint_samples 3000 joint_samples 204 [124064, 55170] +processed_samples 3000 unjoint_samples 3000 joint_samples 204 [124064, 55170] +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +processed_samples 3000 unjoint_samples 3000 joint_samples 210 [109963, 100082] +processed_samples 3000 unjoint_samples 3000 joint_samples 210 [109963, 100082] +processed_samples 3000 unjoint_samples 3000 joint_samples 215 [95111, 126828] +processed_samples 3000 unjoint_samples 3000 joint_samples 209 [116530, 129996] +processed_samples 3000 unjoint_samples 3000 joint_samples 215 [95111, 126828] +processed_samples 3000 unjoint_samples 3000 joint_samples 209 [116530, 129996] +processed_samples 3000 unjoint_samples 3000 joint_samples 207 [120032, 17059] +processed_samples 3000 unjoint_samples 3000 joint_samples 207 [120032, 17059] + [2024-11-28 10:32:55] iteration 831/ 1000 | consumed samples: 53184 | elapsed time per iteration (ms): 69294.3 | throughput per GPU (TFLOP/s/GPU): 111.2 | learning rate: 4.579283E-07 | global batch size: 64 | lm loss: 7.035551E-01 | loss scale: 1.0 | grad norm: 1.105 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +processed_samples 3001 unjoint_samples 3000 joint_samples 204 [96327, 114487] +processed_samples 3001 unjoint_samples 3000 joint_samples 204 [96327, 114487] +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x555dec9d4800] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 10:34:11] iteration 832/ 1000 | consumed samples: 53248 | elapsed time per iteration (ms): 76534.5 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 4.538097E-07 | global batch size: 64 | lm loss: 7.037050E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 10:35:35] iteration 833/ 1000 | consumed samples: 53312 | elapsed time per iteration (ms): 83738.5 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.497131E-07 | global batch size: 64 | lm loss: 6.761808E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 10:36:49] iteration 834/ 1000 | consumed samples: 53376 | elapsed time per iteration (ms): 73376.7 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 4.456385E-07 | global batch size: 64 | lm loss: 7.318485E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 10:38:26] iteration 835/ 1000 | consumed samples: 53440 | elapsed time per iteration (ms): 97012.1 | throughput per GPU (TFLOP/s/GPU): 79.5 | learning rate: 4.415861E-07 | global batch size: 64 | lm loss: 6.641833E-01 | loss scale: 1.0 | grad norm: 0.998 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-28 10:39:44] iteration 836/ 1000 | consumed samples: 53504 | elapsed time per iteration (ms): 78840.2 | throughput per GPU (TFLOP/s/GPU): 97.8 | learning rate: 4.375557E-07 | global batch size: 64 | lm loss: 6.187526E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 10:41:09] iteration 837/ 1000 | consumed samples: 53568 | elapsed time per iteration (ms): 84164.7 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 4.335475E-07 | global batch size: 64 | lm loss: 7.115659E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure + [2024-11-28 10:42:43] iteration 838/ 1000 | consumed samples: 53632 | elapsed time per iteration (ms): 94853.6 | throughput per GPU (TFLOP/s/GPU): 81.3 | learning rate: 4.295615E-07 | global batch size: 64 | lm loss: 7.126722E-01 | loss scale: 1.0 | grad norm: 1.005 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 10:44:07] iteration 839/ 1000 | consumed samples: 53696 | elapsed time per iteration (ms): 83725.2 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.255977E-07 | global batch size: 64 | lm loss: 6.734149E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure + [2024-11-28 10:45:38] iteration 840/ 1000 | consumed samples: 53760 | elapsed time per iteration (ms): 91050.7 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 4.216562E-07 | global batch size: 64 | lm loss: 6.599411E-01 | loss scale: 1.0 | grad norm: 1.389 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 10:47:12] iteration 841/ 1000 | consumed samples: 53824 | elapsed time per iteration (ms): 93607.0 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 4.177371E-07 | global batch size: 64 | lm loss: 6.965013E-01 | loss scale: 1.0 | grad norm: 1.028 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 10:48:31] iteration 842/ 1000 | consumed samples: 53888 | elapsed time per iteration (ms): 79458.7 | throughput per GPU (TFLOP/s/GPU): 97.0 | learning rate: 4.138403E-07 | global batch size: 64 | lm loss: 6.894560E-01 | loss scale: 1.0 | grad norm: 1.232 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure +[h264 @ 0x55d955ffc880] mmco: unref short failure + [2024-11-28 10:49:57] iteration 843/ 1000 | consumed samples: 53952 | elapsed time per iteration (ms): 85751.9 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 4.099659E-07 | global batch size: 64 | lm loss: 6.982825E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 10:51:28] iteration 844/ 1000 | consumed samples: 54016 | elapsed time per iteration (ms): 90570.1 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.061140E-07 | global batch size: 64 | lm loss: 6.635154E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 10:53:04] iteration 845/ 1000 | consumed samples: 54080 | elapsed time per iteration (ms): 96328.2 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 4.022845E-07 | global batch size: 64 | lm loss: 7.333788E-01 | loss scale: 1.0 | grad norm: 0.926 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 10:54:30] iteration 846/ 1000 | consumed samples: 54144 | elapsed time per iteration (ms): 85899.4 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 3.984776E-07 | global batch size: 64 | lm loss: 7.403162E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 10:55:51] iteration 847/ 1000 | consumed samples: 54208 | elapsed time per iteration (ms): 81509.2 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 3.946933E-07 | global batch size: 64 | lm loss: 6.359328E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure + [2024-11-28 10:57:18] iteration 848/ 1000 | consumed samples: 54272 | elapsed time per iteration (ms): 86457.1 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.909315E-07 | global batch size: 64 | lm loss: 6.534768E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 10:59:08] iteration 849/ 1000 | consumed samples: 54336 | elapsed time per iteration (ms): 109584.1 | throughput per GPU (TFLOP/s/GPU): 70.3 | learning rate: 3.871925E-07 | global batch size: 64 | lm loss: 6.476867E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 11:00:45] iteration 850/ 1000 | consumed samples: 54400 | elapsed time per iteration (ms): 97565.7 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 3.834760E-07 | global batch size: 64 | lm loss: 6.968692E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:02:06] iteration 851/ 1000 | consumed samples: 54464 | elapsed time per iteration (ms): 81200.0 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 3.797824E-07 | global batch size: 64 | lm loss: 6.481150E-01 | loss scale: 1.0 | grad norm: 0.833 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 11:03:33] iteration 852/ 1000 | consumed samples: 54528 | elapsed time per iteration (ms): 86955.3 | throughput per GPU (TFLOP/s/GPU): 88.6 | learning rate: 3.761115E-07 | global batch size: 64 | lm loss: 6.985618E-01 | loss scale: 1.0 | grad norm: 1.381 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-28 11:05:07] iteration 853/ 1000 | consumed samples: 54592 | elapsed time per iteration (ms): 93991.4 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 3.724633E-07 | global batch size: 64 | lm loss: 6.918560E-01 | loss scale: 1.0 | grad norm: 0.751 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-28 11:07:09] iteration 854/ 1000 | consumed samples: 54656 | elapsed time per iteration (ms): 122071.7 | throughput per GPU (TFLOP/s/GPU): 63.1 | learning rate: 3.688381E-07 | global batch size: 64 | lm loss: 6.708304E-01 | loss scale: 1.0 | grad norm: 0.839 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure + [2024-11-28 11:08:48] iteration 855/ 1000 | consumed samples: 54720 | elapsed time per iteration (ms): 99086.2 | throughput per GPU (TFLOP/s/GPU): 77.8 | learning rate: 3.652357E-07 | global batch size: 64 | lm loss: 5.969853E-01 | loss scale: 1.0 | grad norm: 1.076 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 11:10:39] iteration 856/ 1000 | consumed samples: 54784 | elapsed time per iteration (ms): 110343.7 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 3.616562E-07 | global batch size: 64 | lm loss: 6.009317E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:12:05] iteration 857/ 1000 | consumed samples: 54848 | elapsed time per iteration (ms): 86422.8 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.580997E-07 | global batch size: 64 | lm loss: 7.241082E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-28 11:13:29] iteration 858/ 1000 | consumed samples: 54912 | elapsed time per iteration (ms): 83739.2 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 3.545662E-07 | global batch size: 64 | lm loss: 6.354287E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-28 11:15:01] iteration 859/ 1000 | consumed samples: 54976 | elapsed time per iteration (ms): 92087.2 | throughput per GPU (TFLOP/s/GPU): 83.7 | learning rate: 3.510557E-07 | global batch size: 64 | lm loss: 6.979592E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555dee6b8180] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure + [2024-11-28 11:16:17] iteration 860/ 1000 | consumed samples: 55040 | elapsed time per iteration (ms): 75816.1 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 3.475682E-07 | global batch size: 64 | lm loss: 6.640246E-01 | loss scale: 1.0 | grad norm: 1.084 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:17:45] iteration 861/ 1000 | consumed samples: 55104 | elapsed time per iteration (ms): 87869.5 | throughput per GPU (TFLOP/s/GPU): 87.7 | learning rate: 3.441039E-07 | global batch size: 64 | lm loss: 6.442217E-01 | loss scale: 1.0 | grad norm: 0.981 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 11:19:15] iteration 862/ 1000 | consumed samples: 55168 | elapsed time per iteration (ms): 90642.3 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 3.406627E-07 | global batch size: 64 | lm loss: 5.933395E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:20:32] iteration 863/ 1000 | consumed samples: 55232 | elapsed time per iteration (ms): 76873.5 | throughput per GPU (TFLOP/s/GPU): 100.3 | learning rate: 3.372447E-07 | global batch size: 64 | lm loss: 6.333019E-01 | loss scale: 1.0 | grad norm: 0.810 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 11:22:00] iteration 864/ 1000 | consumed samples: 55296 | elapsed time per iteration (ms): 87690.3 | throughput per GPU (TFLOP/s/GPU): 87.9 | learning rate: 3.338499E-07 | global batch size: 64 | lm loss: 6.640263E-01 | loss scale: 1.0 | grad norm: 1.172 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x555ded0b5480] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 11:23:55] iteration 865/ 1000 | consumed samples: 55360 | elapsed time per iteration (ms): 114569.4 | throughput per GPU (TFLOP/s/GPU): 67.3 | learning rate: 3.304783E-07 | global batch size: 64 | lm loss: 6.280389E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:25:10] iteration 866/ 1000 | consumed samples: 55424 | elapsed time per iteration (ms): 75815.6 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 3.271301E-07 | global batch size: 64 | lm loss: 7.477145E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-28 11:26:37] iteration 867/ 1000 | consumed samples: 55488 | elapsed time per iteration (ms): 86926.4 | throughput per GPU (TFLOP/s/GPU): 88.7 | learning rate: 3.238051E-07 | global batch size: 64 | lm loss: 6.348403E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure + [2024-11-28 11:28:47] iteration 868/ 1000 | consumed samples: 55552 | elapsed time per iteration (ms): 129771.7 | throughput per GPU (TFLOP/s/GPU): 59.4 | learning rate: 3.205035E-07 | global batch size: 64 | lm loss: 6.754884E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure + [2024-11-28 11:30:11] iteration 869/ 1000 | consumed samples: 55616 | elapsed time per iteration (ms): 83466.8 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 3.172253E-07 | global batch size: 64 | lm loss: 6.710061E-01 | loss scale: 1.0 | grad norm: 0.948 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:31:31] iteration 870/ 1000 | consumed samples: 55680 | elapsed time per iteration (ms): 79923.1 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 3.139705E-07 | global batch size: 64 | lm loss: 6.762735E-01 | loss scale: 1.0 | grad norm: 2.749 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:32:46] iteration 871/ 1000 | consumed samples: 55744 | elapsed time per iteration (ms): 75679.4 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 3.107391E-07 | global batch size: 64 | lm loss: 6.117524E-01 | loss scale: 1.0 | grad norm: 0.852 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:34:09] iteration 872/ 1000 | consumed samples: 55808 | elapsed time per iteration (ms): 82728.9 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 3.075313E-07 | global batch size: 64 | lm loss: 6.489487E-01 | loss scale: 1.0 | grad norm: 0.882 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure + [2024-11-28 11:35:54] iteration 873/ 1000 | consumed samples: 55872 | elapsed time per iteration (ms): 104669.5 | throughput per GPU (TFLOP/s/GPU): 73.6 | learning rate: 3.043469E-07 | global batch size: 64 | lm loss: 6.838205E-01 | loss scale: 1.0 | grad norm: 0.993 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x555dedc5cf80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:37:20] iteration 874/ 1000 | consumed samples: 55936 | elapsed time per iteration (ms): 86668.0 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 3.011862E-07 | global batch size: 64 | lm loss: 6.566857E-01 | loss scale: 1.0 | grad norm: 0.933 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:38:50] iteration 875/ 1000 | consumed samples: 56000 | elapsed time per iteration (ms): 89306.3 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 2.980490E-07 | global batch size: 64 | lm loss: 6.576104E-01 | loss scale: 1.0 | grad norm: 0.993 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure + [2024-11-28 11:40:21] iteration 876/ 1000 | consumed samples: 56064 | elapsed time per iteration (ms): 91021.0 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 2.949354E-07 | global batch size: 64 | lm loss: 6.744531E-01 | loss scale: 1.0 | grad norm: 0.759 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure + [2024-11-28 11:42:18] iteration 877/ 1000 | consumed samples: 56128 | elapsed time per iteration (ms): 117762.8 | throughput per GPU (TFLOP/s/GPU): 65.5 | learning rate: 2.918455E-07 | global batch size: 64 | lm loss: 6.744663E-01 | loss scale: 1.0 | grad norm: 1.002 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure + [2024-11-28 11:43:58] iteration 878/ 1000 | consumed samples: 56192 | elapsed time per iteration (ms): 99548.2 | throughput per GPU (TFLOP/s/GPU): 77.4 | learning rate: 2.887793E-07 | global batch size: 64 | lm loss: 6.407770E-01 | loss scale: 1.0 | grad norm: 0.719 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure + [2024-11-28 11:45:28] iteration 879/ 1000 | consumed samples: 56256 | elapsed time per iteration (ms): 89613.4 | throughput per GPU (TFLOP/s/GPU): 86.0 | learning rate: 2.857367E-07 | global batch size: 64 | lm loss: 7.363208E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x555df3e6f800] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure + [2024-11-28 11:47:22] iteration 880/ 1000 | consumed samples: 56320 | elapsed time per iteration (ms): 114021.1 | throughput per GPU (TFLOP/s/GPU): 67.6 | learning rate: 2.827180E-07 | global batch size: 64 | lm loss: 5.785704E-01 | loss scale: 1.0 | grad norm: 0.735 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-28 11:48:44] iteration 881/ 1000 | consumed samples: 56384 | elapsed time per iteration (ms): 81971.8 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 2.797230E-07 | global batch size: 64 | lm loss: 7.418721E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 11:50:13] iteration 882/ 1000 | consumed samples: 56448 | elapsed time per iteration (ms): 89729.7 | throughput per GPU (TFLOP/s/GPU): 85.9 | learning rate: 2.767519E-07 | global batch size: 64 | lm loss: 7.065222E-01 | loss scale: 1.0 | grad norm: 0.932 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure + [2024-11-28 11:51:29] iteration 883/ 1000 | consumed samples: 56512 | elapsed time per iteration (ms): 75093.3 | throughput per GPU (TFLOP/s/GPU): 102.7 | learning rate: 2.738045E-07 | global batch size: 64 | lm loss: 6.494349E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-28 11:53:07] iteration 884/ 1000 | consumed samples: 56576 | elapsed time per iteration (ms): 98685.4 | throughput per GPU (TFLOP/s/GPU): 78.1 | learning rate: 2.708811E-07 | global batch size: 64 | lm loss: 6.827214E-01 | loss scale: 1.0 | grad norm: 0.803 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 11:54:26] iteration 885/ 1000 | consumed samples: 56640 | elapsed time per iteration (ms): 78916.8 | throughput per GPU (TFLOP/s/GPU): 97.7 | learning rate: 2.679816E-07 | global batch size: 64 | lm loss: 6.591750E-01 | loss scale: 1.0 | grad norm: 2.252 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555def251b40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 11:56:03] iteration 886/ 1000 | consumed samples: 56704 | elapsed time per iteration (ms): 96961.1 | throughput per GPU (TFLOP/s/GPU): 79.5 | learning rate: 2.651060E-07 | global batch size: 64 | lm loss: 5.800773E-01 | loss scale: 1.0 | grad norm: 0.867 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee1d4480] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 11:57:57] iteration 887/ 1000 | consumed samples: 56768 | elapsed time per iteration (ms): 113772.5 | throughput per GPU (TFLOP/s/GPU): 67.8 | learning rate: 2.622543E-07 | global batch size: 64 | lm loss: 6.355096E-01 | loss scale: 1.0 | grad norm: 1.576 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 11:59:32] iteration 888/ 1000 | consumed samples: 56832 | elapsed time per iteration (ms): 95001.9 | throughput per GPU (TFLOP/s/GPU): 81.1 | learning rate: 2.594267E-07 | global batch size: 64 | lm loss: 6.524200E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-28 12:01:10] iteration 889/ 1000 | consumed samples: 56896 | elapsed time per iteration (ms): 98289.0 | throughput per GPU (TFLOP/s/GPU): 78.4 | learning rate: 2.566231E-07 | global batch size: 64 | lm loss: 6.437273E-01 | loss scale: 1.0 | grad norm: 1.347 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:02:49] iteration 890/ 1000 | consumed samples: 56960 | elapsed time per iteration (ms): 99010.1 | throughput per GPU (TFLOP/s/GPU): 77.9 | learning rate: 2.538436E-07 | global batch size: 64 | lm loss: 6.941308E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:05:10] iteration 891/ 1000 | consumed samples: 57024 | elapsed time per iteration (ms): 140404.7 | throughput per GPU (TFLOP/s/GPU): 54.9 | learning rate: 2.510881E-07 | global batch size: 64 | lm loss: 6.219099E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure + [2024-11-28 12:06:48] iteration 892/ 1000 | consumed samples: 57088 | elapsed time per iteration (ms): 98149.2 | throughput per GPU (TFLOP/s/GPU): 78.5 | learning rate: 2.483568E-07 | global batch size: 64 | lm loss: 6.992269E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955d53340] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure + [2024-11-28 12:08:06] iteration 893/ 1000 | consumed samples: 57152 | elapsed time per iteration (ms): 77768.3 | throughput per GPU (TFLOP/s/GPU): 99.1 | learning rate: 2.456496E-07 | global batch size: 64 | lm loss: 6.428531E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 12:09:34] iteration 894/ 1000 | consumed samples: 57216 | elapsed time per iteration (ms): 88494.0 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 2.429665E-07 | global batch size: 64 | lm loss: 6.883608E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:11:09] iteration 895/ 1000 | consumed samples: 57280 | elapsed time per iteration (ms): 95126.9 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 2.403077E-07 | global batch size: 64 | lm loss: 6.512347E-01 | loss scale: 1.0 | grad norm: 0.857 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure + [2024-11-28 12:12:38] iteration 896/ 1000 | consumed samples: 57344 | elapsed time per iteration (ms): 88535.9 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 2.376731E-07 | global batch size: 64 | lm loss: 5.932364E-01 | loss scale: 1.0 | grad norm: 0.981 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 12:13:56] iteration 897/ 1000 | consumed samples: 57408 | elapsed time per iteration (ms): 78513.5 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 2.350628E-07 | global batch size: 64 | lm loss: 6.852360E-01 | loss scale: 1.0 | grad norm: 1.502 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 12:15:22] iteration 898/ 1000 | consumed samples: 57472 | elapsed time per iteration (ms): 85905.6 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 2.324767E-07 | global batch size: 64 | lm loss: 6.652462E-01 | loss scale: 1.0 | grad norm: 1.168 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959cd93c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-28 12:17:05] iteration 899/ 1000 | consumed samples: 57536 | elapsed time per iteration (ms): 102585.9 | throughput per GPU (TFLOP/s/GPU): 75.1 | learning rate: 2.299149E-07 | global batch size: 64 | lm loss: 6.818770E-01 | loss scale: 1.0 | grad norm: 0.756 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 12:18:22] iteration 900/ 1000 | consumed samples: 57600 | elapsed time per iteration (ms): 77580.8 | throughput per GPU (TFLOP/s/GPU): 99.4 | learning rate: 2.273775E-07 | global batch size: 64 | lm loss: 6.114725E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (267077.58, 267077.97) +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure + [2024-11-28 12:24:05] iteration 901/ 1000 | consumed samples: 57664 | elapsed time per iteration (ms): 75885.6 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 2.248645E-07 | global batch size: 64 | lm loss: 6.980703E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 12:25:27] iteration 902/ 1000 | consumed samples: 57728 | elapsed time per iteration (ms): 81488.6 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 2.223758E-07 | global batch size: 64 | lm loss: 7.051459E-01 | loss scale: 1.0 | grad norm: 0.884 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 12:26:51] iteration 903/ 1000 | consumed samples: 57792 | elapsed time per iteration (ms): 84522.0 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 2.199115E-07 | global batch size: 64 | lm loss: 6.813552E-01 | loss scale: 1.0 | grad norm: 1.164 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure + [2024-11-28 12:28:06] iteration 904/ 1000 | consumed samples: 57856 | elapsed time per iteration (ms): 74846.7 | throughput per GPU (TFLOP/s/GPU): 103.0 | learning rate: 2.174717E-07 | global batch size: 64 | lm loss: 6.666945E-01 | loss scale: 1.0 | grad norm: 0.965 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 12:29:16] iteration 905/ 1000 | consumed samples: 57920 | elapsed time per iteration (ms): 69661.1 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 2.150564E-07 | global batch size: 64 | lm loss: 6.282266E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 12:30:29] iteration 906/ 1000 | consumed samples: 57984 | elapsed time per iteration (ms): 72981.1 | throughput per GPU (TFLOP/s/GPU): 105.6 | learning rate: 2.126655E-07 | global batch size: 64 | lm loss: 6.620821E-01 | loss scale: 1.0 | grad norm: 0.951 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure +[h264 @ 0x555dedacf800] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure + [2024-11-28 12:31:51] iteration 907/ 1000 | consumed samples: 58048 | elapsed time per iteration (ms): 82389.5 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 2.102992E-07 | global batch size: 64 | lm loss: 6.396784E-01 | loss scale: 1.0 | grad norm: 1.041 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:35:14] iteration 908/ 1000 | consumed samples: 58112 | elapsed time per iteration (ms): 202675.5 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.079574E-07 | global batch size: 64 | lm loss: 6.770626E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure + [2024-11-28 12:36:40] iteration 909/ 1000 | consumed samples: 58176 | elapsed time per iteration (ms): 86037.5 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 2.056402E-07 | global batch size: 64 | lm loss: 6.487489E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 12:38:12] iteration 910/ 1000 | consumed samples: 58240 | elapsed time per iteration (ms): 91373.7 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 2.033476E-07 | global batch size: 64 | lm loss: 7.297677E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure + [2024-11-28 12:39:31] iteration 911/ 1000 | consumed samples: 58304 | elapsed time per iteration (ms): 79254.7 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 2.010795E-07 | global batch size: 64 | lm loss: 7.363273E-01 | loss scale: 1.0 | grad norm: 0.994 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure + [2024-11-28 12:41:12] iteration 912/ 1000 | consumed samples: 58368 | elapsed time per iteration (ms): 100778.2 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 1.988362E-07 | global batch size: 64 | lm loss: 6.452677E-01 | loss scale: 1.0 | grad norm: 1.218 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x55d959911280] mmco: unref short failure +[h264 @ 0x55d956f78600] mmco: unref short failure +[h264 @ 0x555dee0da840] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure + [2024-11-28 12:42:40] iteration 913/ 1000 | consumed samples: 58432 | elapsed time per iteration (ms): 88495.8 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 1.966174E-07 | global batch size: 64 | lm loss: 7.108324E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d955ee69c0] mmco: unref short failure +[h264 @ 0x55d955ee69c0] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x555decea5e80] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d95d112a80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x55d956b9c3c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure + [2024-11-28 12:43:57] iteration 914/ 1000 | consumed samples: 58496 | elapsed time per iteration (ms): 76541.2 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 1.944234E-07 | global batch size: 64 | lm loss: 7.447613E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:45:20] iteration 915/ 1000 | consumed samples: 58560 | elapsed time per iteration (ms): 83460.2 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 1.922541E-07 | global batch size: 64 | lm loss: 6.993719E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure + [2024-11-28 12:46:44] iteration 916/ 1000 | consumed samples: 58624 | elapsed time per iteration (ms): 83996.5 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 1.901095E-07 | global batch size: 64 | lm loss: 6.727135E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dec64f880] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dec64f880] mmco: unref short failure +[h264 @ 0x555dec64f880] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555dec64f880] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 12:48:29] iteration 917/ 1000 | consumed samples: 58688 | elapsed time per iteration (ms): 104791.6 | throughput per GPU (TFLOP/s/GPU): 73.6 | learning rate: 1.879897E-07 | global batch size: 64 | lm loss: 6.296797E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure + [2024-11-28 12:49:50] iteration 918/ 1000 | consumed samples: 58752 | elapsed time per iteration (ms): 80806.4 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 1.858946E-07 | global batch size: 64 | lm loss: 6.743851E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 12:51:03] iteration 919/ 1000 | consumed samples: 58816 | elapsed time per iteration (ms): 73359.1 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 1.838244E-07 | global batch size: 64 | lm loss: 6.623120E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:52:39] iteration 920/ 1000 | consumed samples: 58880 | elapsed time per iteration (ms): 96155.9 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 1.817789E-07 | global batch size: 64 | lm loss: 6.973625E-01 | loss scale: 1.0 | grad norm: 1.291 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 12:54:10] iteration 921/ 1000 | consumed samples: 58944 | elapsed time per iteration (ms): 90212.1 | throughput per GPU (TFLOP/s/GPU): 85.4 | learning rate: 1.797583E-07 | global batch size: 64 | lm loss: 6.481697E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 12:55:30] iteration 922/ 1000 | consumed samples: 59008 | elapsed time per iteration (ms): 80544.6 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 1.777626E-07 | global batch size: 64 | lm loss: 6.707134E-01 | loss scale: 1.0 | grad norm: 0.896 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555dee104680] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 12:56:50] iteration 923/ 1000 | consumed samples: 59072 | elapsed time per iteration (ms): 79886.4 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 1.757917E-07 | global batch size: 64 | lm loss: 6.061962E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:58:11] iteration 924/ 1000 | consumed samples: 59136 | elapsed time per iteration (ms): 80945.2 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 1.738458E-07 | global batch size: 64 | lm loss: 6.526064E-01 | loss scale: 1.0 | grad norm: 1.038 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 12:59:34] iteration 925/ 1000 | consumed samples: 59200 | elapsed time per iteration (ms): 83194.9 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 1.719248E-07 | global batch size: 64 | lm loss: 6.461999E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555defd60880] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956d792c0] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 13:00:55] iteration 926/ 1000 | consumed samples: 59264 | elapsed time per iteration (ms): 81298.6 | throughput per GPU (TFLOP/s/GPU): 94.8 | learning rate: 1.700287E-07 | global batch size: 64 | lm loss: 6.680710E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 13:03:38] iteration 927/ 1000 | consumed samples: 59328 | elapsed time per iteration (ms): 162383.2 | throughput per GPU (TFLOP/s/GPU): 47.5 | learning rate: 1.681576E-07 | global batch size: 64 | lm loss: 6.349767E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure + [2024-11-28 13:05:00] iteration 928/ 1000 | consumed samples: 59392 | elapsed time per iteration (ms): 82192.0 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 1.663114E-07 | global batch size: 64 | lm loss: 7.176995E-01 | loss scale: 1.0 | grad norm: 0.912 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 13:06:59] iteration 929/ 1000 | consumed samples: 59456 | elapsed time per iteration (ms): 118896.2 | throughput per GPU (TFLOP/s/GPU): 64.8 | learning rate: 1.644903E-07 | global batch size: 64 | lm loss: 7.511212E-01 | loss scale: 1.0 | grad norm: 1.079 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x555def9a5ac0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure +[h264 @ 0x55d95742d280] mmco: unref short failure + [2024-11-28 13:09:17] iteration 930/ 1000 | consumed samples: 59520 | elapsed time per iteration (ms): 137974.5 | throughput per GPU (TFLOP/s/GPU): 55.9 | learning rate: 1.626942E-07 | global batch size: 64 | lm loss: 6.085041E-01 | loss scale: 1.0 | grad norm: 1.197 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure + [2024-11-28 13:11:09] iteration 931/ 1000 | consumed samples: 59584 | elapsed time per iteration (ms): 112172.9 | throughput per GPU (TFLOP/s/GPU): 68.7 | learning rate: 1.609232E-07 | global batch size: 64 | lm loss: 6.549844E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 13:12:27] iteration 932/ 1000 | consumed samples: 59648 | elapsed time per iteration (ms): 77561.0 | throughput per GPU (TFLOP/s/GPU): 99.4 | learning rate: 1.591772E-07 | global batch size: 64 | lm loss: 6.965685E-01 | loss scale: 1.0 | grad norm: 0.962 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-28 13:14:04] iteration 933/ 1000 | consumed samples: 59712 | elapsed time per iteration (ms): 97431.5 | throughput per GPU (TFLOP/s/GPU): 79.1 | learning rate: 1.574562E-07 | global batch size: 64 | lm loss: 7.647378E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 13:15:25] iteration 934/ 1000 | consumed samples: 59776 | elapsed time per iteration (ms): 81194.6 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 1.557604E-07 | global batch size: 64 | lm loss: 6.471760E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x55d957f0f640] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dec5cd500] mmco: unref short failure +[h264 @ 0x555dec5cd500] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555dec5cd500] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x55d959b29f40] mmco: unref short failure +[h264 @ 0x555dec5cd500] mmco: unref short failure +[h264 @ 0x555dec5cd500] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555dedbd1740] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 13:16:41] iteration 935/ 1000 | consumed samples: 59840 | elapsed time per iteration (ms): 75355.6 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.540897E-07 | global batch size: 64 | lm loss: 6.300914E-01 | loss scale: 1.0 | grad norm: 0.981 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 13:19:26] iteration 936/ 1000 | consumed samples: 59904 | elapsed time per iteration (ms): 164799.3 | throughput per GPU (TFLOP/s/GPU): 46.8 | learning rate: 1.524441E-07 | global batch size: 64 | lm loss: 6.399311E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df333fac0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 13:21:03] iteration 937/ 1000 | consumed samples: 59968 | elapsed time per iteration (ms): 97316.7 | throughput per GPU (TFLOP/s/GPU): 79.2 | learning rate: 1.508237E-07 | global batch size: 64 | lm loss: 6.114424E-01 | loss scale: 1.0 | grad norm: 0.949 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec73eb00] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure + [2024-11-28 13:22:34] iteration 938/ 1000 | consumed samples: 60032 | elapsed time per iteration (ms): 90834.0 | throughput per GPU (TFLOP/s/GPU): 84.9 | learning rate: 1.492284E-07 | global batch size: 64 | lm loss: 7.345959E-01 | loss scale: 1.0 | grad norm: 1.041 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure + [2024-11-28 13:24:27] iteration 939/ 1000 | consumed samples: 60096 | elapsed time per iteration (ms): 113150.1 | throughput per GPU (TFLOP/s/GPU): 68.1 | learning rate: 1.476583E-07 | global batch size: 64 | lm loss: 6.628835E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 13:25:49] iteration 940/ 1000 | consumed samples: 60160 | elapsed time per iteration (ms): 81984.3 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 1.461135E-07 | global batch size: 64 | lm loss: 6.852534E-01 | loss scale: 1.0 | grad norm: 0.761 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x55d956defa80] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dee6ec240] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-28 13:27:03] iteration 941/ 1000 | consumed samples: 60224 | elapsed time per iteration (ms): 74320.8 | throughput per GPU (TFLOP/s/GPU): 103.7 | learning rate: 1.445938E-07 | global batch size: 64 | lm loss: 7.116451E-01 | loss scale: 1.0 | grad norm: 0.812 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dede47780] mmco: unref short failure +[h264 @ 0x555dede47780] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure + [2024-11-28 13:28:27] iteration 942/ 1000 | consumed samples: 60288 | elapsed time per iteration (ms): 83413.5 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 1.430994E-07 | global batch size: 64 | lm loss: 6.267360E-01 | loss scale: 1.0 | grad norm: 0.787 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dede47780] mmco: unref short failure +[h264 @ 0x555dede47780] mmco: unref short failure + [2024-11-28 13:29:47] iteration 943/ 1000 | consumed samples: 60352 | elapsed time per iteration (ms): 80788.2 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 1.416302E-07 | global batch size: 64 | lm loss: 6.856556E-01 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555dede47780] mmco: unref short failure +[h264 @ 0x555dede47780] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure + [2024-11-28 13:30:58] iteration 944/ 1000 | consumed samples: 60416 | elapsed time per iteration (ms): 70071.0 | throughput per GPU (TFLOP/s/GPU): 110.0 | learning rate: 1.401863E-07 | global batch size: 64 | lm loss: 7.190543E-01 | loss scale: 1.0 | grad norm: 1.094 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-28 13:32:32] iteration 945/ 1000 | consumed samples: 60480 | elapsed time per iteration (ms): 94661.8 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 1.387676E-07 | global batch size: 64 | lm loss: 6.541150E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x555dedc46200] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure + [2024-11-28 13:33:54] iteration 946/ 1000 | consumed samples: 60544 | elapsed time per iteration (ms): 81568.8 | throughput per GPU (TFLOP/s/GPU): 94.5 | learning rate: 1.373743E-07 | global batch size: 64 | lm loss: 6.668899E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d9586ad200] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x555dece4d600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 13:35:17] iteration 947/ 1000 | consumed samples: 60608 | elapsed time per iteration (ms): 83452.5 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 1.360062E-07 | global batch size: 64 | lm loss: 7.081062E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure + [2024-11-28 13:36:37] iteration 948/ 1000 | consumed samples: 60672 | elapsed time per iteration (ms): 79886.5 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 1.346635E-07 | global batch size: 64 | lm loss: 6.683935E-01 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure +[h264 @ 0x55d95873d100] mmco: unref short failure +[h264 @ 0x555dee700e80] mmco: unref short failure + [2024-11-28 13:37:56] iteration 949/ 1000 | consumed samples: 60736 | elapsed time per iteration (ms): 79185.2 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 1.333461E-07 | global batch size: 64 | lm loss: 7.324390E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure +[h264 @ 0x55d957fc08c0] mmco: unref short failure + [2024-11-28 13:40:22] iteration 950/ 1000 | consumed samples: 60800 | elapsed time per iteration (ms): 145884.4 | throughput per GPU (TFLOP/s/GPU): 52.8 | learning rate: 1.320541E-07 | global batch size: 64 | lm loss: 7.290564E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 13:42:08] iteration 951/ 1000 | consumed samples: 60864 | elapsed time per iteration (ms): 105541.4 | throughput per GPU (TFLOP/s/GPU): 73.0 | learning rate: 1.307874E-07 | global batch size: 64 | lm loss: 6.607154E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure + [2024-11-28 13:43:15] iteration 952/ 1000 | consumed samples: 60928 | elapsed time per iteration (ms): 67102.5 | throughput per GPU (TFLOP/s/GPU): 114.9 | learning rate: 1.295461E-07 | global batch size: 64 | lm loss: 6.186213E-01 | loss scale: 1.0 | grad norm: 0.969 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure + [2024-11-28 13:44:39] iteration 953/ 1000 | consumed samples: 60992 | elapsed time per iteration (ms): 83875.1 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 1.283302E-07 | global batch size: 64 | lm loss: 6.808333E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955b8bcc0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 13:45:46] iteration 954/ 1000 | consumed samples: 61056 | elapsed time per iteration (ms): 67345.2 | throughput per GPU (TFLOP/s/GPU): 114.5 | learning rate: 1.271397E-07 | global batch size: 64 | lm loss: 6.142411E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 13:47:07] iteration 955/ 1000 | consumed samples: 61120 | elapsed time per iteration (ms): 81067.4 | throughput per GPU (TFLOP/s/GPU): 95.1 | learning rate: 1.259746E-07 | global batch size: 64 | lm loss: 6.965019E-01 | loss scale: 1.0 | grad norm: 1.067 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure + [2024-11-28 13:48:25] iteration 956/ 1000 | consumed samples: 61184 | elapsed time per iteration (ms): 77374.3 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 1.248349E-07 | global batch size: 64 | lm loss: 7.084676E-01 | loss scale: 1.0 | grad norm: 1.119 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 13:49:44] iteration 957/ 1000 | consumed samples: 61248 | elapsed time per iteration (ms): 79787.0 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 1.237207E-07 | global batch size: 64 | lm loss: 6.091076E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure + [2024-11-28 13:51:04] iteration 958/ 1000 | consumed samples: 61312 | elapsed time per iteration (ms): 79308.8 | throughput per GPU (TFLOP/s/GPU): 97.2 | learning rate: 1.226319E-07 | global batch size: 64 | lm loss: 6.252887E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d9560849c0] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure + [2024-11-28 13:52:42] iteration 959/ 1000 | consumed samples: 61376 | elapsed time per iteration (ms): 98444.8 | throughput per GPU (TFLOP/s/GPU): 78.3 | learning rate: 1.215686E-07 | global batch size: 64 | lm loss: 6.871887E-01 | loss scale: 1.0 | grad norm: 0.990 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 13:54:30] iteration 960/ 1000 | consumed samples: 61440 | elapsed time per iteration (ms): 108204.2 | throughput per GPU (TFLOP/s/GPU): 71.2 | learning rate: 1.205308E-07 | global batch size: 64 | lm loss: 6.405410E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d9565f0700] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure + [2024-11-28 13:56:02] iteration 961/ 1000 | consumed samples: 61504 | elapsed time per iteration (ms): 91685.8 | throughput per GPU (TFLOP/s/GPU): 84.1 | learning rate: 1.195184E-07 | global batch size: 64 | lm loss: 6.903030E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure + [2024-11-28 13:57:42] iteration 962/ 1000 | consumed samples: 61568 | elapsed time per iteration (ms): 100042.4 | throughput per GPU (TFLOP/s/GPU): 77.1 | learning rate: 1.185315E-07 | global batch size: 64 | lm loss: 6.939712E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure +[h264 @ 0x555dee570b40] mmco: unref short failure +[h264 @ 0x55d955861580] mmco: unref short failure + [2024-11-28 13:59:10] iteration 963/ 1000 | consumed samples: 61632 | elapsed time per iteration (ms): 87425.2 | throughput per GPU (TFLOP/s/GPU): 88.2 | learning rate: 1.175702E-07 | global batch size: 64 | lm loss: 6.489189E-01 | loss scale: 1.0 | grad norm: 1.170 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure + [2024-11-28 14:00:35] iteration 964/ 1000 | consumed samples: 61696 | elapsed time per iteration (ms): 85013.2 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 1.166343E-07 | global batch size: 64 | lm loss: 6.793079E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x55d956b88f40] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d958e05fc0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d956f7b580] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure + [2024-11-28 14:02:34] iteration 965/ 1000 | consumed samples: 61760 | elapsed time per iteration (ms): 119301.6 | throughput per GPU (TFLOP/s/GPU): 64.6 | learning rate: 1.157240E-07 | global batch size: 64 | lm loss: 5.998636E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure + [2024-11-28 14:04:00] iteration 966/ 1000 | consumed samples: 61824 | elapsed time per iteration (ms): 85603.7 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 1.148392E-07 | global batch size: 64 | lm loss: 5.948734E-01 | loss scale: 1.0 | grad norm: 1.641 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure + [2024-11-28 14:05:11] iteration 967/ 1000 | consumed samples: 61888 | elapsed time per iteration (ms): 71636.9 | throughput per GPU (TFLOP/s/GPU): 107.6 | learning rate: 1.139800E-07 | global batch size: 64 | lm loss: 6.451741E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 14:06:19] iteration 968/ 1000 | consumed samples: 61952 | elapsed time per iteration (ms): 67761.0 | throughput per GPU (TFLOP/s/GPU): 113.8 | learning rate: 1.131463E-07 | global batch size: 64 | lm loss: 6.464132E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x55d9593709c0] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d959b34680] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +processed_samples 3500 unjoint_samples 3500 joint_samples 240 [37535, 126039] +processed_samples 3500 unjoint_samples 3500 joint_samples 240 [37535, 126039] +[h264 @ 0x55d959d1ea40] mmco: unref short failure +processed_samples 3500 unjoint_samples 3500 joint_samples 243 [117828, 123942] +processed_samples 3500 unjoint_samples 3500 joint_samples 248 [61546, 125289] +[h264 @ 0x555ded7e1040] mmco: unref short failure +processed_samples 3500 unjoint_samples 3500 joint_samples 244 [124745, 128771] +processed_samples 3500 unjoint_samples 3500 joint_samples 243 [117828, 123942] +processed_samples 3500 unjoint_samples 3500 joint_samples 242 [127735, 101974] +processed_samples 3500 unjoint_samples 3500 joint_samples 248 [61546, 125289] +processed_samples 3500 unjoint_samples 3500 joint_samples 240 [117489, 97448] +processed_samples 3500 unjoint_samples 3500 joint_samples 243 [99233, 90783] +processed_samples 3500 unjoint_samples 3500 joint_samples 244 [124745, 128771] +processed_samples 3501 unjoint_samples 3500 joint_samples 236 [111699, 114901] +processed_samples 3500 unjoint_samples 3500 joint_samples 242 [127735, 101974] +processed_samples 3500 unjoint_samples 3500 joint_samples 240 [117489, 97448] +processed_samples 3500 unjoint_samples 3500 joint_samples 243 [99233, 90783] +processed_samples 3501 unjoint_samples 3500 joint_samples 236 [111699, 114901] +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee588f00] mmco: unref short failure + [2024-11-28 14:07:46] iteration 969/ 1000 | consumed samples: 62016 | elapsed time per iteration (ms): 87505.2 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 1.123382E-07 | global batch size: 64 | lm loss: 6.732799E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure + [2024-11-28 14:09:42] iteration 970/ 1000 | consumed samples: 62080 | elapsed time per iteration (ms): 115166.3 | throughput per GPU (TFLOP/s/GPU): 66.9 | learning rate: 1.115556E-07 | global batch size: 64 | lm loss: 6.146312E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x55d9567ebec0] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure + [2024-11-28 14:10:59] iteration 971/ 1000 | consumed samples: 62144 | elapsed time per iteration (ms): 77734.4 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 1.107986E-07 | global batch size: 64 | lm loss: 6.231325E-01 | loss scale: 1.0 | grad norm: 0.734 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure + [2024-11-28 14:12:25] iteration 972/ 1000 | consumed samples: 62208 | elapsed time per iteration (ms): 85280.0 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 1.100672E-07 | global batch size: 64 | lm loss: 6.697411E-01 | loss scale: 1.0 | grad norm: 0.819 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555dedf05880] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d956386f00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555deee52400] mmco: unref short failure + [2024-11-28 14:13:45] iteration 973/ 1000 | consumed samples: 62272 | elapsed time per iteration (ms): 80061.8 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 1.093615E-07 | global batch size: 64 | lm loss: 6.537416E-01 | loss scale: 1.0 | grad norm: 0.809 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure + [2024-11-28 14:15:17] iteration 974/ 1000 | consumed samples: 62336 | elapsed time per iteration (ms): 92646.4 | throughput per GPU (TFLOP/s/GPU): 83.2 | learning rate: 1.086813E-07 | global batch size: 64 | lm loss: 7.027911E-01 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure + [2024-11-28 14:16:52] iteration 975/ 1000 | consumed samples: 62400 | elapsed time per iteration (ms): 94500.1 | throughput per GPU (TFLOP/s/GPU): 81.6 | learning rate: 1.080267E-07 | global batch size: 64 | lm loss: 7.508754E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dee0f8a80] mmco: unref short failure +[h264 @ 0x55d956aae040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x555ded70ae00] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure + [2024-11-28 14:18:09] iteration 976/ 1000 | consumed samples: 62464 | elapsed time per iteration (ms): 77501.4 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 1.073977E-07 | global batch size: 64 | lm loss: 7.104969E-01 | loss scale: 1.0 | grad norm: 1.024 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:19:30] iteration 977/ 1000 | consumed samples: 62528 | elapsed time per iteration (ms): 80497.2 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 1.067943E-07 | global batch size: 64 | lm loss: 6.504764E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x555df32c1f40] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure + [2024-11-28 14:20:53] iteration 978/ 1000 | consumed samples: 62592 | elapsed time per iteration (ms): 83270.3 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 1.062166E-07 | global batch size: 64 | lm loss: 7.602927E-01 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x555dee0de680] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure +[h264 @ 0x55d957fe7780] mmco: unref short failure + [2024-11-28 14:22:28] iteration 979/ 1000 | consumed samples: 62656 | elapsed time per iteration (ms): 94731.5 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 1.056645E-07 | global batch size: 64 | lm loss: 6.811559E-01 | loss scale: 1.0 | grad norm: 0.965 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d957cd6140] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 14:24:05] iteration 980/ 1000 | consumed samples: 62720 | elapsed time per iteration (ms): 97338.2 | throughput per GPU (TFLOP/s/GPU): 79.2 | learning rate: 1.051381E-07 | global batch size: 64 | lm loss: 6.308494E-01 | loss scale: 1.0 | grad norm: 0.862 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:25:32] iteration 981/ 1000 | consumed samples: 62784 | elapsed time per iteration (ms): 86780.9 | throughput per GPU (TFLOP/s/GPU): 88.8 | learning rate: 1.046373E-07 | global batch size: 64 | lm loss: 6.862647E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d957a03080] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 14:27:22] iteration 982/ 1000 | consumed samples: 62848 | elapsed time per iteration (ms): 110273.4 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 1.041621E-07 | global batch size: 64 | lm loss: 6.260334E-01 | loss scale: 1.0 | grad norm: 0.784 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x55d955acf300] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec836c00] mmco: unref short failure +[h264 @ 0x55d95827cc40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d955d24640] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x555dedfc6580] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d959d1ea40] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x555dedb7a200] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure + [2024-11-28 14:28:41] iteration 983/ 1000 | consumed samples: 62912 | elapsed time per iteration (ms): 78985.5 | throughput per GPU (TFLOP/s/GPU): 97.6 | learning rate: 1.037126E-07 | global batch size: 64 | lm loss: 6.394918E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure + [2024-11-28 14:30:07] iteration 984/ 1000 | consumed samples: 62976 | elapsed time per iteration (ms): 85222.2 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.032888E-07 | global batch size: 64 | lm loss: 6.519013E-01 | loss scale: 1.0 | grad norm: 0.713 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:31:21] iteration 985/ 1000 | consumed samples: 63040 | elapsed time per iteration (ms): 74396.5 | throughput per GPU (TFLOP/s/GPU): 103.6 | learning rate: 1.028906E-07 | global batch size: 64 | lm loss: 6.667175E-01 | loss scale: 1.0 | grad norm: 0.862 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d9581bc7c0] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x55d956da5240] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure +[h264 @ 0x555dec811980] mmco: unref short failure + [2024-11-28 14:32:53] iteration 986/ 1000 | consumed samples: 63104 | elapsed time per iteration (ms): 92158.6 | throughput per GPU (TFLOP/s/GPU): 83.6 | learning rate: 1.025181E-07 | global batch size: 64 | lm loss: 6.164773E-01 | loss scale: 1.0 | grad norm: 1.084 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d956f36e40] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure + [2024-11-28 14:34:18] iteration 987/ 1000 | consumed samples: 63168 | elapsed time per iteration (ms): 85187.1 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.021713E-07 | global batch size: 64 | lm loss: 6.563728E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:35:51] iteration 988/ 1000 | consumed samples: 63232 | elapsed time per iteration (ms): 92580.3 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 1.018501E-07 | global batch size: 64 | lm loss: 6.570765E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555dedf89ec0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x555df4169bc0] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d955d4f640] mmco: unref short failure +[h264 @ 0x55d9568d0900] mmco: unref short failure +[h264 @ 0x555dece9f180] mmco: unref short failure + [2024-11-28 14:36:59] iteration 989/ 1000 | consumed samples: 63296 | elapsed time per iteration (ms): 67976.0 | throughput per GPU (TFLOP/s/GPU): 113.4 | learning rate: 1.015546E-07 | global batch size: 64 | lm loss: 7.026603E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x55d95678a600] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555df4961e80] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555deda84c00] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956ee5780] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure + [2024-11-28 14:38:14] iteration 990/ 1000 | consumed samples: 63360 | elapsed time per iteration (ms): 75326.0 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.012849E-07 | global batch size: 64 | lm loss: 6.594824E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x55d95c76c1c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x555dec9973c0] mmco: unref short failure +[h264 @ 0x55d95b718d00] mmco: unref short failure +[h264 @ 0x555def216fc0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x555df1cb0600] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure +[h264 @ 0x55d9569edbc0] mmco: unref short failure + [2024-11-28 14:39:43] iteration 991/ 1000 | consumed samples: 63424 | elapsed time per iteration (ms): 88711.4 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 1.010408E-07 | global batch size: 64 | lm loss: 6.635240E-01 | loss scale: 1.0 | grad norm: 0.979 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555dec5a5140] mmco: unref short failure +[h264 @ 0x55d957fbfd40] mmco: unref short failure + [2024-11-28 14:41:11] iteration 992/ 1000 | consumed samples: 63488 | elapsed time per iteration (ms): 87459.0 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 1.008223E-07 | global batch size: 64 | lm loss: 7.460098E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:42:23] iteration 993/ 1000 | consumed samples: 63552 | elapsed time per iteration (ms): 72575.2 | throughput per GPU (TFLOP/s/GPU): 106.2 | learning rate: 1.006296E-07 | global batch size: 64 | lm loss: 6.943420E-01 | loss scale: 1.0 | grad norm: 0.979 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:43:46] iteration 994/ 1000 | consumed samples: 63616 | elapsed time per iteration (ms): 83141.3 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 1.004626E-07 | global batch size: 64 | lm loss: 6.353670E-01 | loss scale: 1.0 | grad norm: 0.996 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded7e1040] [h264 @ 0x55d95707f900] mmco: unref short failure +mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x55d956b4f800] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956f36840] mmco: unref short failure + [2024-11-28 14:45:46] iteration 995/ 1000 | consumed samples: 63680 | elapsed time per iteration (ms): 119455.2 | throughput per GPU (TFLOP/s/GPU): 64.5 | learning rate: 1.003212E-07 | global batch size: 64 | lm loss: 6.850454E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d957883f80] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure + [2024-11-28 14:47:25] iteration 996/ 1000 | consumed samples: 63744 | elapsed time per iteration (ms): 99541.9 | throughput per GPU (TFLOP/s/GPU): 77.4 | learning rate: 1.002056E-07 | global batch size: 64 | lm loss: 6.471719E-01 | loss scale: 1.0 | grad norm: 1.028 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:48:59] iteration 997/ 1000 | consumed samples: 63808 | elapsed time per iteration (ms): 93832.3 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 1.001156E-07 | global batch size: 64 | lm loss: 6.529545E-01 | loss scale: 1.0 | grad norm: 1.344 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555dee100dc0] mmco: unref short failure +[h264 @ 0x55d956767080] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x555ded696c00] mmco: unref short failure +[h264 @ 0x55d956122340] mmco: unref short failure +[h264 @ 0x555ded7e1040] mmco: unref short failure +[h264 @ 0x55d95707f900] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure +[h264 @ 0x55d955e4acc0] mmco: unref short failure +[h264 @ 0x555decb36640] mmco: unref short failure + [2024-11-28 14:50:35] iteration 998/ 1000 | consumed samples: 63872 | elapsed time per iteration (ms): 96048.8 | throughput per GPU (TFLOP/s/GPU): 80.3 | learning rate: 1.000514E-07 | global batch size: 64 | lm loss: 6.684725E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2024-11-28 14:52:26] iteration 999/ 1000 | consumed samples: 63936 | elapsed time per iteration (ms): 110316.3 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 1.000128E-07 | global batch size: 64 | lm loss: 6.331837E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations: 0 | number of nan iterations: 0 | +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x555df0f7cd00] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x55d9559dfc40] mmco: unref short failure +[h264 @ 0x555ded679600] mmco: unref short failure +[h264 @ 0x55d9560aea40] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure +[h264 @ 0x555df136f280] mmco: unref short failure +[h264 @ 0x55d956042040] mmco: unref short failure + [2024-11-28 14:53:45] iteration 1000/ 1000 | consumed samples: 64000 | elapsed time per iteration (ms): 79276.2 | throughput per GPU (TFLOP/s/GPU): 97.2 | learning rate: 1.000000E-07 | global batch size: 64 | lm loss: 7.160559E-01 | loss scale: 1.0 | grad norm: 1.057 | number of skipped iterations: 0 | number of nan iterations: 0 | +(min, max) time across ranks (ms): + save-checkpoint ................................: (274392.58, 274393.16) +[2024-11-28 15:04:24,072] torch.distributed.elastic.agent.server.api: [ERROR] Error waiting on exit barrier. Elapsed: 300.10606265068054 seconds ++ set +x