+ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt
Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt
+ export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0
+ mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0
+ DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml
+ TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/
+ CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/
+ VIT_CKPT_LOAD_DIR=/
+ CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//
+ rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/
sending incremental file list

sent 71 bytes  received 12 bytes  166.00 bytes/sec
total size is 23.84K  speedup is 287.17
+ cd /local_disk/cognitron_vl/
+ rm -fr datasets
+ mkdir -p datasets
+ ln -s /data/data/ datasets/CV
+ ln -s /data/data/LLM datasets/LLM
+ ln -s /data/data/LMM datasets/LMM
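The preamble above is consistent with a launcher that derives a per-run output directory from the script name plus a timestamp, then snapshots the data config into it. A minimal sketch of such a preamble; OUTPUT_DIR, TIMESTAMP, and the tee-style logging are assumptions, not taken from the actual script:

    #!/bin/bash
    # Hypothetical reconstruction of the run-directory setup seen in the trace.
    set -x
    TIMESTAMP=$(date +%Y%m%d_%H%M%S)
    OUTPUT_DIR=/data_2/output/LM/scripts/modellink/qwen25/$(basename "$0")/${TIMESTAMP}/
    mkdir -p "${OUTPUT_DIR}"
    echo Logging output to "${OUTPUT_DIR}/log_node${NODE_RANK:-0}.txt"
    # Per-device Ascend runtime logs are collected under the same run directory.
    export ASCEND_PROCESS_LOG_PATH=${OUTPUT_DIR}/ascend/${NODE_RANK:-0}
    mkdir -p "${ASCEND_PROCESS_LOG_PATH}"
    # Keep a copy of the data config next to the checkpoints for reproducibility.
    rsync -avh "${DATA_PATH}" "${OUTPUT_DIR}"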
+ source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh
++ source /usr/local/Ascend/driver/bin/setenv.bash
+++ DEP_INFO_FILE=/etc/ascend_install.info
+++ [[ -f /etc/ascend_install.info ]]
+++ . /etc/ascend_install.info
+++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common
+++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver
+++ DRV_LIB64_LDPATH=/driver/lib64
+++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
++ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
++++ arch
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp
+++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit
+++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
++ export HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_EXEC_TIMEOUT=7200
++ export COMBINED_ENABLE=1
++ export MULTI_STREAM_MEMORY_REUSE=1
++ export HCCL_RDMA_TC=160
++ export HCCL_RDMA_SL=5
++ export HCCL_INTRA_PCIE_ENABLE=0
++ export HCCL_INTRA_ROCE_ENABLE=1
++ export HCCL_RDMA_TIMEOUT=20
++ export INF_NAN_MODE_ENABLE=1
++ export DISTRIBUTED_BACKEND=hccl
++ export ASCEND_LAUNCH_BLOCKING=0
++ export ASCEND_SLOG_PRINT_TO_STDOUT=0
++ export ASCEND_GLOBAL_LOG_LEVEL=3
++ export ASCEND_GLOBAL_EVENT_ENABLE=0
++ export TASK_QUEUE_ENABLE=1
++ export PTCOPY_ENABLE=1
++ export COMBINED_ENABLE=1
++ export DYNAMIC_OP=ADD#MUL
++ export HCCL_WHITELIST_DISABLE=1
++ export HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_WHITELIST_DISABLE=1
++ export CUDA_DEVICE_MAX_CONNECTIONS=1
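Condensed, the job-specific half of set_env_mg_npu.sh amounts to the exports below; the grouping and the comments are our interpretation of the trace, not text from the script itself:

    # Collective-communication robustness: generous HCCL timeouts for the
    # long-context (128k) startup and checkpoint-load phases.
    export HCCL_CONNECT_TIMEOUT=7200
    export HCCL_EXEC_TIMEOUT=7200
    export HCCL_RDMA_TC=160 HCCL_RDMA_SL=5 HCCL_RDMA_TIMEOUT=20
    export HCCL_INTRA_PCIE_ENABLE=0 HCCL_INTRA_ROCE_ENABLE=1
    export HCCL_WHITELIST_DISABLE=1
    export DISTRIBUTED_BACKEND=hccl
    # Ascend runtime: non-blocking kernel launches, quiet host logs
    # (level 3 is error-only in the Ascend log-level convention).
    export ASCEND_LAUNCH_BLOCKING=0
    export ASCEND_SLOG_PRINT_TO_STDOUT=0
    export ASCEND_GLOBAL_LOG_LEVEL=3
    export ASCEND_GLOBAL_EVENT_ENABLE=0
    # torch_npu execution knobs, as set by the script.
    export TASK_QUEUE_ENABLE=1 PTCOPY_ENABLE=1 COMBINED_ENABLE=1
    export MULTI_STREAM_MEMORY_REUSE=1 INF_NAN_MODE_ENABLE=1
    export DYNAMIC_OP=ADD#MUL
    export CUDA_DEVICE_MAX_CONNECTIONS=1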
++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt
Looking in links: /data/software/
Processing /data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1))
Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0)
Processing /data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3))
Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0)
Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0)
Processing /data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6))
Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5)
Processing /data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8))
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0)
Processing /data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10))
Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2)
Processing /data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12))
Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1)
Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5)
Processing /data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15))
Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16)
Processing /data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17))
Processing /data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18))
Processing /data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19))
Processing /data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20))
Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3)
Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25)
Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0)
Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5)
Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3)
Processing /data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6))
Processing /data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3)
Processing /data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3)
Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1)
Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0)
Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1)
Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30)
Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (9.0.0)
Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15)
Processing /data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0)
Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1)
Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3)
Processing /data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7)
Processing /data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1)
Processing /data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18))
Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0)
Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0)
Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2)
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0)
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.66.2
    Uninstalling tqdm-4.66.2:
      Successfully uninstalled tqdm-4.66.2
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.4.2
    Uninstalling safetensors-0.4.2:
      Successfully uninstalled safetensors-0.4.2
  Attempting uninstall: requests
    Found existing installation: requests 2.31.0
    Uninstalling requests-2.31.0:
      Successfully uninstalled requests-2.31.0
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 15.0.0
    Uninstalling pyarrow-15.0.0:
      Successfully uninstalled pyarrow-15.0.0
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.25.0
    Uninstalling accelerate-0.25.0:
      Successfully uninstalled accelerate-0.25.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.16.0
    Uninstalling datasets-2.16.0:
      Successfully uninstalled datasets-2.16.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed.
tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed.
tikit 1.8.2.240926 requires future==0.18.2, which is not installed.
tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed.
tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed.
tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed.
tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed.
tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed.
tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed.
tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed.
tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed.
tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed.
tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible.
tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible.
tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible.
tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible.
tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible.
tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible.
Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
++ return 0
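All of the resolver complaints above come from the preinstalled tikit distribution; nothing in the training stack shown in this log depends on it. One way to re-surface whatever is still inconsistent after the install is pip's built-in consistency check:

    pip3 check
    # If tikit were actually needed, the same offline mirror could serve its
    # exact pins (assumption: those wheels exist under /data/software/), e.g.:
    # pip3 install --no-index --find-links=/data/software/ 'urllib3==1.26.7' 'idna==3.3'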
+ MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
+ MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
+ MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
Looking in links: /data/software/
Obtaining file:///local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: megatron_core
  Building editable for megatron_core (pyproject.toml): started
  Building editable for megatron_core (pyproject.toml): finished with status 'done'
  Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=06d5bd071b6eadb2bc6965a495bd802172dae415af74dd60b1478328d6910bcd
  Stored in directory: /tmp/pip-ephem-wheel-cache-m7mtfhay/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff
Successfully built megatron_core
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: megatron_core
Successfully installed megatron_core-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
Looking in links: /data/software/
Obtaining file:///local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA'
WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA'
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: mindspeed
  Running setup.py develop for mindspeed
Successfully installed mindspeed-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/
Looking in links: /data/software/
Obtaining file:///local_disk/cognitron_vl/third_party/ModelLink
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4)
Processing /data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1)
Processing /data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4)
Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1)
Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1)
Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0)
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0)
Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6)
Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2)
Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0)
Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3)
Processing /data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1)
Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1)
Requirement already satisfied: regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25)
Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3)
Processing /data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1)
Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3)
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1)
Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3)
Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5)
Building wheels for collected packages: transformers_stream_generator
  Building wheel for transformers_stream_generator (setup.py): started
  Building wheel for transformers_stream_generator (setup.py): finished with status 'done'
  Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=3ed62a866ab10917ceed94a0bafc0596380802f798ed67b7de78b76fe0b65f1f
  Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c
Successfully built transformers_stream_generator
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.3
    Uninstalling transformers-4.46.3:
      Successfully uninstalled transformers-4.46.3
  Attempting uninstall: peft
    Found existing installation: peft 0.7.0
    Uninstalling peft-0.7.0:
      Successfully uninstalled peft-0.7.0
  Running setup.py develop for modellink
Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
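A quick way to confirm which copies win on sys.path after these editable installs and the PYTHONPATH export below; this is a sanity check we would run, not part of the original script:

    # Expect megatron.core to resolve to the vendored third_party tree and
    # transformers to report 4.43.2, ModelLink's pinned version.
    python3 -c 'import megatron.core, mindspeed, transformers; print(megatron.core.__file__); print(transformers.__version__)'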
+ export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+ GPUS_PER_NODE=16
+ NNODES=32
+ NODE_RANK=0
+ MASTER_PORT=34567
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ VISION_SEQ_LENGTH=1025
+ IMAGE_TOKEN_LENGTH=256
+ IMAGE_SIZE=448
+ VISION_MODEL_TYPE=intern_300m
+ TP=8
+ PP=1
+ CP=2
+ CP_ALGO=megatron_cp_algo
+ CP_MASK=causal
+ DISTRIBUTED_ARGS=' --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 '
+ GPT_ARGS=' --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint '
+ DATA_ARGS=' --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 '
+ CKPT_ARGS=' --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// '
+ OUTPUT_ARGS=' --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 '
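For orientation, the parallel topology implied by these arguments works out as follows; this is simple arithmetic over the values above, not output from the log:

    # 32 nodes x 16 NPUs each = 512 ranks in total.
    GPUS_PER_NODE=16; NNODES=32; TP=8; PP=1; CP=2; GBS=64; MBS=1
    WORLD_SIZE=$((GPUS_PER_NODE * NNODES))   # 512
    MP=$((TP * PP * CP))                     # 16 ranks per model replica
    DP=$((WORLD_SIZE / MP))                  # 32 data-parallel replicas
    ACC=$((GBS / (DP * MBS)))                # 2 gradient-accumulation steps
    echo "world=$WORLD_SIZE replicas=$DP acc_steps=$ACC"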
+ torchrun --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING]
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] *****************************************
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] *****************************************
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:299: ImportWarning:
*************************************************************************************************************
The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now..
The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now..
The backend in torch.distributed.init_process_group set to hccl now..
The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now..
The device parameters have been replaced with npu in the function below:
torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.set_default_device, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty
*************************************************************************************************************
  warnings.warn(msg, ImportWarning)
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:260: RuntimeWarning: torch.jit.script and torch.jit.script_method will be disabled by transfer_to_npu, which currently does not support them, if you need to enable them, please do not use transfer_to_npu.
  warnings.warn(msg, RuntimeWarning)
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp...
Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja...
Building extension module adaptive_cp...
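The torch_npu shim warnings above also explain why the launch can pass --distributed-backend nccl: the ImportWarning states the backend is rewritten to hccl at init time. As a minimal illustration of what the shim enables (the import pattern is torch_npu's documented usage; the tensor example is ours):

    # Any "cuda" device request is transparently redirected to an NPU.
    python3 -c 'import torch; import torch_npu; from torch_npu.contrib import transfer_to_npu; x = torch.ones(2, 2, device="cuda"); print(x.device)'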
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
[the same message repeated by the 15 remaining local ranks]
[1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c /local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o
[2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so
Loading extension module adaptive_cp...
/local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
[the same "Loading extension module adaptive_cp..." and npu_matmul_add_fp32 warnings from the remaining local ranks]
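As the ninja message says, the JIT build's parallelism is tunable; pinning it bounds host CPU use during the one-off adaptive_cp build:

    # Optional: cap ninja's compile jobs for torch extension builds.
    export MAX_JOBS=8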
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... 
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
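This warning fires at torchvision import time because its bundled image extension links against libc10_cuda.so, which does not exist in this CUDA-free NPU environment; image decoding via torchvision.io is unavailable, but nothing else is affected. If the per-rank repetition is noisy, a filter installed before the import silences it. A minimal sketch (the message prefix is the only assumption):

import warnings

# must run before `import torchvision`; the warning is raised in torchvision/io/image.py at import time
warnings.filterwarnings("ignore", message="Failed to load image Python extension")

import torchvision  # imported after the filter on purpose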
using world size: 512, data-parallel size: 32, context-parallel size: 2, tensor-model-parallel size: 8, pipeline-model-parallel size: 1
WARNING: Setting args.overlap_p2p_comm to False since non-interleaved schedule does not support overlapping p2p communication
accumulate and all-reduce gradients in fp32 for bfloat16 data type.
using torch.bfloat16 for parameters ...
[INFO] Setting args.use_flash_attn=True since context parallel is enabled.
[INFO] Setting args.create_attention_mask_in_dataloader to False since reset_data=False or alibi_without_flash_attn=False or args.tokenizer_padding_side=right
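The echoed sizes are mutually consistent; a quick sanity check with the numbers from this log:

# world_size must equal tp * pp * cp * dp for the layout reported above
tp, pp, cp, world_size = 8, 1, 2, 512
dp = world_size // (tp * pp * cp)
assert dp == 32  # matches "data-parallel size: 32"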
------------------------ ModelLink Arguments ------------------------
accumulate_allreduce_grads_in_fp32 .............. True
adam_beta1 ...................................... 0.9
adam_beta2 ...................................... 0.999
adam_eps ........................................ 1e-08
adaptive_cp_dynamic_attn_mask ................... False
adaptive_cp_manually_set_mask_list .............. False
adaptive_cp_only_reschedule ..................... False
adaptive_cp_without_coarse ...................... False
adaptive_recompute_device_size .................. -1
adaptive_recompute_device_swap .................. False
adaptive_recompute_profiling_step ............... 10
add_bias_linear ................................. False
add_class_token ................................. True
add_dense_bias .................................. False
add_position_embedding .......................... True
add_qkv_bias .................................... True
add_rmsnorm_offset .............................. False
adlr_autoresume ................................. False
adlr_autoresume_interval ........................ 1000
apply_layernorm_1p .............................. False
apply_query_key_layer_scaling ................... False
apply_residual_connection_post_layernorm ........ False
apply_rope_fusion ............................... True
async_tensor_model_parallel_allreduce ........... False
attention_dropout ............................... 0.0
attention_mask_on_cpu ........................... False
attention_softmax_in_fp32 ....................... True
attn_logit_softcapping .......................... None
auto_detect_ckpt_format ......................... False
barrier_with_L1_time ............................ True
bert_binary_head ................................ True
bert_embedder_type .............................. megatron
bert_load ....................................... None
bf16 ............................................ True
bias_dropout_fusion ............................. True
bias_gelu_fusion ................................ False
bias_swiglu_fusion .............................. True
biencoder_projection_dim ........................ 0
biencoder_shared_query_context_model ............ False
block_data_path ................................. None
check_for_nan_in_loss_and_grad .................. True
chunk_size ...................................... 4096
ckpt_fully_parallel_save ........................ False
ckpt_step ....................................... None
classes_fraction ................................ 1.0
clip_grad ....................................... 1.0
clone_scatter_output_in_embedding ............... True
consumed_train_samples .......................... 0
consumed_valid_samples .......................... 0
context_parallel_algo ........................... megatron_cp_algo
context_parallel_size ........................... 2
cp_attention_mask_type .......................... causal
cp_window_size .................................. 1
create_attention_mask_in_dataloader ............. False
cross_dataset_joint ............................. True
data_cache_path ................................. None
data_parallel_random_init ....................... False
data_parallel_size .............................. 32
data_path ....................................... ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml']
data_per_class_fraction ......................... 1.0
data_seq_length ................................. 131072
data_sharding ................................... True
dataloader_type ................................. single
decoder_num_layers .............................. None
decoder_seq_length .............................. None
decoupled_lr .................................... None
decoupled_min_lr ................................ None
delay_grad_reduce ............................... True
delay_param_gather .............................. False
dim_model_base .................................. None
dino_bottleneck_size ............................ 256
dino_freeze_last_layer .......................... 1
dino_head_hidden_size ........................... 2048
dino_local_crops_number ......................... 10
dino_local_img_size ............................. 96
dino_norm_last_layer ............................ False
dino_teacher_temp ............................... 0.07
dino_warmup_teacher_temp ........................ 0.04
dino_warmup_teacher_temp_epochs ................. 30
dist_ckpt_format ................................ torch_dist
distribute_saved_activations .................... False
distributed_backend ............................. nccl
distributed_timeout_minutes ..................... 120
dpo_beta ........................................ 0.1
dpo_ftx ......................................... 0.0
dpo_label_smoothing ............................. 0.0
dpo_loss_type ................................... sigmoid
embed_layernorm ................................. False
embedding_multiplier_scale ...................... 1.0
embedding_path .................................. None
empty_unused_memory_level ....................... 0
enable_chunk_memory ............................. False
enable_chunk_sequence ........................... False
enable_hbmfault_repair .......................... False
enable_high_availability ........................ False
enable_one_logger ............................... False
enable_optimizer_state_local_copy ............... False
enable_recompute_layers_per_pp_rank ............. False
enable_token_rearrange_opt ...................... False
encoder_num_layers .............................. 48
encoder_seq_length .............................. 131072
end_weight_decay ................................ 0.0
eod_mask_loss ................................... False
eval_interval ................................... 100
eval_iters ...................................... 0
evidence_data_path .............................. None
exit_duration_in_mins ........................... None
exit_interval ................................... None
exit_on_missing_checkpoint ...................... False
exit_signal_handler ............................. False
expert_interval ................................. 1
expert_model_parallel_size ...................... 1
ffn_hidden_size ................................. 13824
fill_neg_inf .................................... False
finetune ........................................ True
first_k_dense_replace ........................... None
first_pipeline_num_layers ....................... 0
fp16 ............................................ False
fp16_lm_cross_entropy ........................... False
fp32_residual_connection ........................ False
fp8 ............................................. None
fp8_amax_compute_algo ........................... most_recent
fp8_amax_history_len ............................ 1
fp8_interval .................................... 1
fp8_margin ...................................... 0
fp8_wgrad ....................................... True
full_shuffle_instruction_dataset ................ False
geglu ........................................... False
gelu_tanh ....................................... False
global_batch_size ............................... 64
gradient_accumulation_fusion .................... False
group_query_attention ........................... True
head_lr_mult .................................... 1.0
hidden_dropout .................................. 0.0
hidden_size ..................................... 5120
high_freq_factor ................................ None
hysteresis ...................................... 2
ict_head_size ................................... None
ict_load ........................................ None
image_size ...................................... 448
image_token_length .............................. 256
img_h ........................................... 224
img_w ........................................... 224
independent_parallel ............................ False
indexer_batch_size .............................. 128
indexer_log_interval ............................ 1000
inference_batch_times_seqlen_threshold .......... 512
init_method_std ................................. 0.01
init_method_xavier_uniform ...................... False
initial_loss_scale .............................. 4096.0
input_embeds_norm ............................... False
input_jitter .................................... True
input_layernorm_in_fp32 ......................... False
interleave_sliding_window ....................... None
is_instruction_dataset .......................... True
is_pairwise_dataset ............................. False
iter_per_epoch .................................. 1250
jit_compile ..................................... False
kv_channels ..................................... 128
kv_head_repeat_before_uly_alltoall .............. True
kv_lora_rank .................................... None
language_model_freeze ........................... False
lazy_mpu_init ................................... None
load ............................................ /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/
load_checkpoint_loosely ......................... False
local_rank ...................................... None
log_batch_size_to_tensorboard ................... False
log_interval .................................... 1
log_learning_rate_to_tensorboard ................ True
log_loss_scale_to_tensorboard ................... True
log_memory_to_tensorboard ....................... False
log_num_zeros_in_grad ........................... False
log_params_norm ................................. False
log_progress .................................... False
log_throughput .................................. True
log_timers_to_tensorboard ....................... False
log_validation_ppl_to_tensorboard ............... False
log_world_size_to_tensorboard ................... False
logit_mask ...................................... False
lora_alpha ...................................... 32
lora_fusion ..................................... False
lora_load ....................................... None
lora_modules_to_save ............................ None
lora_r .......................................... 16
lora_register_forward_hook ...................... ['word_embeddings', 'input_layernorm']
lora_target_modules ............................. []
loss_scale ...................................... None
loss_scale_window ............................... 1000
low_freq_factor ................................. None
lr .............................................. 5e-06
lr_decay_iters .................................. None
lr_decay_samples ................................ None
lr_decay_style .................................. cosine
lr_warmup_fraction .............................. 0.03
lr_warmup_init .................................. 0.0
lr_warmup_iters ................................. 0
lr_warmup_samples ............................... 0
make_vocab_size_divisible_by .................... 1
manual_gc ....................................... False
manual_gc_eval .................................. True
manual_gc_interval .............................. 0
mask_factor ..................................... 1.0
mask_prob ....................................... 0.15
mask_type ....................................... random
masked_softmax_fusion ........................... False
max_fps ......................................... 1
max_num_frame ................................... 512
max_num_image ................................... 8
max_patch_grid .................................. 12
max_position_embeddings ......................... 131072
max_tokens_to_oom ............................... 12000
merge_file ...................................... None
micro_batch_size ................................ 1
min_loss_scale .................................. 1.0
min_lr .......................................... 1e-07
min_patch_grid .................................. 1
mmap_bin_files .................................. True
mock_data ....................................... False
moe_allgather_overlap_comm ...................... False
moe_alltoall_overlap_comm ....................... False
moe_aux_loss_coeff .............................. 0.0
moe_comm_aux_loss_coeff ......................... 0.0
moe_device_level_aux_loss_coeff ................. 0.0
moe_expert_capacity_factor ...................... None
moe_grouped_gemm ................................ False
moe_input_jitter_eps ............................ None
moe_intermediate_size ........................... None
moe_layer_freq .................................. None
moe_pad_expert_input_to_capacity ................ False
moe_per_layer_logging ........................... False
moe_permutation_async_comm ...................... False
moe_router_load_balancing_type .................. aux_loss
moe_router_topk ................................. 2
moe_token_dispatcher_type ....................... allgather
moe_token_drop_policy ........................... probs
moe_token_dropping .............................. False
moe_tp_extend_ep ................................ False
moe_train_capacity_factor ....................... 1.0
moe_without_activation .......................... False
moe_z_loss_coeff ................................ 0.0
moe_zero_memory ................................. disable
multi_head_latent_attention ..................... False
n_shared_experts ................................ None
nccl_communicator_config_path ................... None
next_tockens .................................... 0
no_load_optim ................................... True
no_load_rng ..................................... True
no_persist_layer_norm ........................... False
no_post_layer_norm .............................. False
no_save_optim ................................... None
no_save_rng ..................................... None
no_shared_storage ............................... False
no_shuffle ...................................... False
noisy_gate_policy ............................... None
noop_layers ..................................... None
norm_epsilon .................................... 1e-06
norm_topk_prob .................................. False
normalization ................................... RMSNorm
num_attention_heads ............................. 40
num_channels .................................... 3
num_classes ..................................... 1000
num_experts ..................................... None
num_layer_list .................................. None
num_layers ...................................... 48
num_layers_per_virtual_pipeline_stage ........... None
num_query_groups ................................ 8
num_workers ..................................... 8
one_logger_entity ............................... hwinf_dcm
one_logger_project .............................. e2e-tracking
one_logger_run_name ............................. None
onnx_safe ....................................... None
openai_gelu ..................................... False
optimizer ....................................... adam
original_max_position_embeddings ................ None
output_bert_embeddings .......................... False
output_layer_slice_num .......................... 1
output_logit_softcapping ........................ None
output_multiplier_scale ......................... None
overlap_grad_reduce ............................. True
overlap_p2p_comm ................................ False
overlap_param_gather ............................ False
override_opt_param_scheduler .................... False
pad_to_multiple_of .............................. 8
padded_vocab_size ............................... 152064
params_dtype .................................... torch.bfloat16
patch_dim ....................................... 16
perform_initialization .......................... True
pipeline_model_parallel_size .................... 1
pipeline_model_parallel_split_rank .............. None
position_embedding_type ......................... rope
post_norm ....................................... False
pre_tockens ..................................... 65536
pref_ftx ........................................ 0.0
pretrained_checkpoint ........................... None
profile ......................................... False
profile_level ................................... level0
profile_ranks ................................... [-1]
profile_record_shapes ........................... False
profile_save_path ............................... ./profile_dir
profile_step_end ................................ 12
profile_step_start .............................. 10
profile_with_cpu ................................ False
profile_with_memory ............................. False
profile_with_stack .............................. False
prompt_format ................................... qwen2
prompt_type ..................................... None
q_lora_rank ..................................... None
qk_layernorm .................................... False
qk_nope_head_dim ................................ None
qk_rope_head_dim ................................ None
query_in_block_prob ............................. 0.1
query_pre_attn_scalar ........................... None
rampup_batch_size ............................... None
rank ............................................ 0
recompute_activation_function ................... False
recompute_activation_function_num_layers ........ None
recompute_granularity ........................... full
recompute_in_advance ............................ False
recompute_in_bubble ............................. False
recompute_method ................................ block
recompute_num_layers ............................ 48
reduce_recompute_for_last_chunk ................. False
ref_model ....................................... None
reset_attention_mask ............................ False
reset_position_ids .............................. False
retriever_report_topk_accuracies ................ []
retriever_score_scaling ......................... False
retriever_seq_length ............................ 256
retro_add_retriever ............................. False
retro_attention_gate ............................ 1
retro_cyclic_train_iters ........................ None
retro_encoder_attention_dropout ................. 0.1
retro_encoder_hidden_dropout .................... 0.1
retro_encoder_layers ............................ 2
retro_num_neighbors ............................. 2
retro_num_retrieved_chunks ...................... 2
retro_project_dir ............................... None
retro_verify_neighbor_count ..................... True
reuse_fp32_param ................................ False
rope_scaling_beta_fast .......................... 32
rope_scaling_beta_slow .......................... 1
rope_scaling_factor ............................. 1.0
rope_scaling_mscale ............................. 1.0
rope_scaling_mscale_all_dim ..................... 0.0
rope_scaling_original_max_position_embeddings ... None
rope_scaling_type ............................... None
rotary_base ..................................... 1000000.0
rotary_interleaved .............................. False
rotary_percent .................................. 1.0
rotary_seq_len_interpolation_factor ............. None
routed_scaling_factor ........................... None
sample_rate ..................................... 1.0
save ............................................ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//
save_interval ................................... 100
scale_depth ..................................... None
scale_emb ....................................... None
scatter_gather_tensors_in_pipeline .............. True
seed ............................................ 424242
seq_aux ......................................... False
seq_length ...................................... 131072
sequence_parallel ............................... True
sgd_momentum .................................... 0.9
shape_order ..................................... SBH
shared_expert_gate .............................. False
shared_expert_gate_output_dimension ............. 1
short_seq_prob .................................. 0.1
skip_bias_add ................................... True
skip_train ...................................... False
sliding_window .................................. None
sparse_mode ..................................... 4
spec ............................................ None
split ........................................... 100,0,0
square_alibi_mask ............................... False
squared_relu .................................... False
stage ........................................... None
standalone_embedding_stage ...................... False
start_weight_decay .............................. 0.0
swap_attention .................................. False
swap_modules .................................... None
swiglu .......................................... True
swin_backbone_type .............................. tiny
tensor_model_parallel_size ...................... 8
tensorboard_dir ................................. None
tensorboard_log_interval ........................ 1
tensorboard_queue_size .......................... 1000
test_data_path .................................. None
test_mode ....................................... False
timing_log_level ................................ 0
timing_log_option ............................... minmax
titles_data_path ................................ None
tokenizer_kwargs ................................ None
tokenizer_model ................................. None
tokenizer_name_or_path .......................... /data_4/models/Qwen/Qwen2.5-14B-Instruct/
tokenizer_not_use_fast .......................... True
tokenizer_padding_side .......................... right
tokenizer_type .................................. PretrainedFromHF
topk_group ...................................... None
tp_2d ........................................... False
tp_comm_bulk_dgrad .............................. True
tp_comm_bulk_wgrad .............................. True
tp_comm_overlap ................................. False
tp_comm_overlap_ag .............................. True
tp_comm_overlap_cfg ............................. None
tp_comm_overlap_rs .............................. True
tp_comm_split_ag ................................ True
tp_comm_split_rs ................................ True
tp_x ............................................ 1
tp_y ............................................ 1
train_data_path ................................. None
train_iters ..................................... 1000
train_samples ................................... None
transformer_impl ................................ local
transformer_pipeline_model_parallel_size ........ 1
ulysses_degree_in_cp ............................ None
untie_embeddings_and_output_weights ............. True
use_checkpoint_args ............................. False
use_checkpoint_opt_param_scheduler .............. False
use_cp_send_recv_overlap ........................ True
use_cpu_initialization .......................... None
use_deter_comp .................................. False
use_dist_ckpt ................................... False
use_distributed_optimizer ....................... True
use_flash_attn .................................. True
use_fused_moe_token_permute_and_unpermute ....... False
use_fused_ring_attention_update ................. False
use_fused_rmsnorm ............................... True
use_fused_rotary_pos_emb ........................ True
use_fused_swiglu ................................ True
use_glm_rope .................................... False
use_mc2 ......................................... False
use_mcore_models ................................ True
use_one_sent_docs ............................... False
use_ring_exchange_p2p ........................... False
use_rotary_position_embeddings .................. True
v_head_dim ...................................... None
valid_data_path ................................. None
variable_seq_lengths ............................ False
virtual_pipeline_model_parallel_size ............ None
vision_backbone_type ............................ vit
vision_context_parallel ......................... False
vision_downsample_ratio ......................... 0.5
vision_downsample_stride ........................ 1.0
vision_model_freeze ............................. True
vision_model_lr_decay_rate ...................... 1.0
vision_model_lr_mult ............................ 1.0
vision_model_recompute .......................... False
vision_model_type ............................... intern_300m
vision_normalize_type ........................... imagenet
vision_pretraining .............................. False
vision_pretraining_type ......................... classify
vision_process_type ............................. dynamic
vision_projector_freeze ......................... False
vision_projector_pre_norm ....................... True
vision_projector_recompute ...................... False
vision_projector_type ........................... mlp
vision_seq_length ............................... 1025
vit_load ........................................ /
vocab_extra_ids ................................. 0
vocab_file ...................................... None
vocab_size ...................................... None
wandb_exp_name ..................................
wandb_project ...................................
wandb_save_dir ..................................
weight_decay .................................... 0.0
weight_decay_incr_style ......................... constant
world_size ...................................... 512
yaml_cfg ........................................ None
-------------------- end of ModelLink Arguments ---------------------
setting number of micro-batches to constant 2
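"setting number of micro-batches to constant 2" follows directly from the batch arguments above; a worked check with values copied from the dump:

global_batch_size, micro_batch_size, dp = 64, 1, 32
num_micro_batches = global_batch_size // (micro_batch_size * dp)
assert num_micro_batches == 2
# similarly, lr_warmup_fraction=0.03 of train_iters=1000 implies 30 warmup iterations of the cosine schedule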
> building PretrainFromHF tokenizer. Vocab file is un-used, loading tokenizer from pre-trained model
> initializing torch distributed ...
[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL)
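The watchdog warning notes that HCCL's execution timeout (7200000 ms) exceeds the 1800000 ms watchdog configured through init_process_group, so the plog may be lost on a hang. One hedged way to align the two, assuming the "hccl" backend name registered by torch_npu and an environment set up as in this job, is to pass a matching timeout when the process group is created:

import datetime
import torch.distributed as dist

# raise the watchdog timeout to match the 7200 s HCCL execution timeout
# (assumption: "hccl" is the backend name provided by torch_npu)
dist.init_process_group(
    backend="hccl",
    timeout=datetime.timedelta(milliseconds=7_200_000),
)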
all tp groups: 64 groups of 8 consecutive ranks: [0, 1, ..., 7], [8, 9, ..., 15], ..., [504, 505, ..., 511]
all ep groups: 512 singleton groups, one per rank (expert model parallel size 1)
all dp groups: 16 groups of 32 ranks at stride 16: [0, 16, 32, ..., 496], [1, 17, ..., 497], ..., [15, 31, ..., 511]
all_dp_modulo_exp_group_ranks: 8 groups of 64 ranks at stride 8: [0, 8, 16, ..., 504], [1, 9, ..., 505], ..., [7, 15, ..., 511]
all_tensor_and_expert_group_ranks: identical to the tp groups (64 groups of 8 consecutive ranks)
all_data_parallel_group_ranks_with_cp: identical to all_dp_modulo_exp_group_ranks (8 groups of 64 ranks at stride 8, dp 32 x cp 2)
> initialized tensor model parallel with size 8
> initialized pipeline model parallel with size 1
> setting random seeds to 424242 ...
> compiling dataset index builder ...
make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
make: Nothing to be done for 'default'.
make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
>>> done with dataset index builder. Compilation time: 0.074 seconds
time to initialize megatron (seconds): -34.653
[after megatron is initialized] datetime: 2024-11-27 12:45:17
building GPT model ...
Building megatron mcore vision language model ...
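The rank groupings condensed above follow mechanically from tp=8, cp=2, dp=32 over 512 ranks. A small sketch that reconstructs them, as pure illustration rather than Megatron's actual initializer:

world, tp, cp = 512, 8, 2
tp_groups = [list(range(start, start + tp)) for start in range(0, world, tp)]   # "all tp groups"
dp_groups = [list(range(lead, world, tp * cp)) for lead in range(tp * cp)]      # "all dp groups"
dp_cp_groups = [list(range(lead, world, tp)) for lead in range(tp)]             # "...group_ranks_with_cp"
assert tp_groups[0] == [0, 1, 2, 3, 4, 5, 6, 7]
assert dp_groups[0][:4] == [0, 16, 32, 48] and len(dp_groups[0]) == 32
assert dp_cp_groups[7][-1] == 511 and len(dp_cp_groups[7]) == 64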
vision_projector_recompute False
model_provider args Namespace(accumulate_allreduce_grads_in_fp32=True, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, adaptive_cp_dynamic_attn_mask=False, adaptive_cp_manually_set_mask_list=False, adaptive_cp_only_reschedule=False, adaptive_cp_without_coarse=False, adaptive_recompute_device_size=-1, adaptive_recompute_device_swap=False, adaptive_recompute_profiling_step=10, add_bias_linear=False, add_class_token=True, add_dense_bias=False, add_position_embedding=True, add_qkv_bias=True, add_rmsnorm_offset=False, adlr_autoresume=False, adlr_autoresume_interval=1000, apply_layernorm_1p=False, apply_query_key_layer_scaling=False, apply_residual_connection_post_layernorm=False, apply_rope_fusion=True, async_tensor_model_parallel_allreduce=False, attention_dropout=0.0, attention_mask_on_cpu=False, attention_softmax_in_fp32=True, attn_logit_softcapping=None, auto_detect_ckpt_format=False, barrier_with_L1_time=True, bert_binary_head=True, bert_embedder_type='megatron', bert_load=None, bf16=True, bias_dropout_fusion=True, bias_gelu_fusion=False, bias_swiglu_fusion=True, biencoder_projection_dim=0, biencoder_shared_query_context_model=False, block_data_path=None, check_for_nan_in_loss_and_grad=True, chunk_size=4096, ckpt_fully_parallel_save=False, ckpt_step=None, classes_fraction=1.0, clip_grad=1.0, clone_scatter_output_in_embedding=True, consumed_train_samples=0, consumed_valid_samples=0, context_parallel_algo='megatron_cp_algo', context_parallel_size=2, cp_attention_mask_type='causal', cp_window_size=1, create_attention_mask_in_dataloader=False, cross_dataset_joint=True, data_cache_path=None, data_parallel_random_init=False, data_parallel_size=32, data_path=['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'], data_per_class_fraction=1.0, data_seq_length=131072, data_sharding=True, dataloader_type='single', decoder_num_layers=None, decoder_seq_length=None, decoupled_lr=None, decoupled_min_lr=None, delay_grad_reduce=True, delay_param_gather=False, dim_model_base=None, dino_bottleneck_size=256, dino_freeze_last_layer=1, dino_head_hidden_size=2048, dino_local_crops_number=10, dino_local_img_size=96, dino_norm_last_layer=False, dino_teacher_temp=0.07, dino_warmup_teacher_temp=0.04, dino_warmup_teacher_temp_epochs=30, dist_ckpt_format='torch_dist', distribute_saved_activations=False, distributed_backend='nccl', distributed_timeout_minutes=120, dpo_beta=0.1, dpo_ftx=0.0, dpo_label_smoothing=0.0, dpo_loss_type='sigmoid', embed_layernorm=False, embedding_multiplier_scale=1.0, embedding_path=None, empty_unused_memory_level=0, enable_chunk_memory=False, enable_chunk_sequence=False, enable_hbmfault_repair=False, enable_high_availability=False, enable_one_logger=False, enable_optimizer_state_local_copy=False, enable_recompute_layers_per_pp_rank=False, enable_token_rearrange_opt=False, encoder_num_layers=48, encoder_seq_length=131072, end_weight_decay=0.0, eod_mask_loss=False, eval_interval=100, eval_iters=0, evidence_data_path=None, exit_duration_in_mins=None, exit_interval=None, exit_on_missing_checkpoint=False, exit_signal_handler=False, expert_interval=1,
expert_model_parallel_size=1, ffn_hidden_size=13824, fill_neg_inf=False, finetune=True, first_k_dense_replace=None, first_pipeline_num_layers=0, fp16=False, fp16_lm_cross_entropy=False, fp32_residual_connection=False, fp8=None, fp8_amax_compute_algo='most_recent', fp8_amax_history_len=1, fp8_interval=1, fp8_margin=0, fp8_wgrad=True, full_shuffle_instruction_dataset=False, geglu=False, gelu_tanh=False, global_batch_size=64, gradient_accumulation_fusion=False, group_query_attention=True, head_lr_mult=1.0, hidden_dropout=0.0, hidden_size=5120, high_freq_factor=None, hysteresis=2, ict_head_size=None, ict_load=None, image_size=448, image_token_length=256, img_h=224, img_w=224, independent_parallel=False, indexer_batch_size=128, indexer_log_interval=1000, inference_batch_times_seqlen_threshold=512, init_method_std=0.01, init_method_xavier_uniform=False, initial_loss_scale=4096.0, input_embeds_norm=False, input_jitter=True, input_layernorm_in_fp32=False, interleave_sliding_window=None, is_instruction_dataset=True, is_pairwise_dataset=False, iter_per_epoch=1250, jit_compile=False, kv_channels=128, kv_head_repeat_before_uly_alltoall=True, kv_lora_rank=None, language_model_freeze=False, lazy_mpu_init=None, load='/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/', load_checkpoint_loosely=False, local_rank=0, log_batch_size_to_tensorboard=False, log_interval=1, log_learning_rate_to_tensorboard=True, log_loss_scale_to_tensorboard=True, log_memory_to_tensorboard=False, log_num_zeros_in_grad=False, log_params_norm=False, log_progress=False, log_throughput=True, log_timers_to_tensorboard=False, log_validation_ppl_to_tensorboard=False, log_world_size_to_tensorboard=False, logit_mask=False, lora_alpha=32, lora_fusion=False, lora_load=None, lora_modules_to_save=None, lora_r=16, lora_register_forward_hook=['word_embeddings', 'input_layernorm'], lora_target_modules=[], loss_scale=None, loss_scale_window=1000, low_freq_factor=None, lr=5e-06, lr_decay_iters=None, lr_decay_samples=None, lr_decay_style='cosine', lr_warmup_fraction=0.03, lr_warmup_init=0.0, lr_warmup_iters=0, lr_warmup_samples=0, make_vocab_size_divisible_by=1, manual_gc=False, manual_gc_eval=True, manual_gc_interval=0, mask_factor=1.0, mask_prob=0.15, mask_type='random', masked_softmax_fusion=False, max_fps=1, max_num_frame=512, max_num_image=8, max_patch_grid=12, max_position_embeddings=131072, max_tokens_to_oom=12000, merge_file=None, micro_batch_size=1, min_loss_scale=1.0, min_lr=1e-07, min_patch_grid=1, mmap_bin_files=True, mock_data=False, model_type=, moe_allgather_overlap_comm=False, moe_alltoall_overlap_comm=False, moe_aux_loss_coeff=0.0, moe_comm_aux_loss_coeff=0.0, moe_device_level_aux_loss_coeff=0.0, moe_expert_capacity_factor=None, moe_grouped_gemm=False, moe_input_jitter_eps=None, moe_intermediate_size=None, moe_layer_freq=None, moe_pad_expert_input_to_capacity=False, moe_per_layer_logging=False, moe_permutation_async_comm=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_token_dispatcher_type='allgather', moe_token_drop_policy='probs', moe_token_dropping=False, moe_tp_extend_ep=False, moe_train_capacity_factor=1.0, moe_without_activation=False, moe_z_loss_coeff=0.0, moe_zero_memory='disable', multi_head_latent_attention=False, n_shared_experts=None, nccl_communicator_config_path=None, next_tockens=0, no_load_optim=True, no_load_rng=True, no_persist_layer_norm=False, no_post_layer_norm=False, no_save_optim=None, no_save_rng=None, 
no_shared_storage=False, no_shuffle=False, noisy_gate_policy=None, noop_layers=None, norm_epsilon=1e-06, norm_topk_prob=False, normalization='RMSNorm', num_attention_heads=40, num_channels=3, num_classes=1000, num_experts=None, num_layer_list=None, num_layers=48, num_layers_per_virtual_pipeline_stage=None, num_query_groups=8, num_workers=8, one_logger_entity='hwinf_dcm', one_logger_project='e2e-tracking', one_logger_run_name=None, onnx_safe=None, openai_gelu=False, optimizer='adam', original_max_position_embeddings=None, output_bert_embeddings=False, output_layer_slice_num=1, output_logit_softcapping=None, output_multiplier_scale=None, overlap_grad_reduce=True, overlap_p2p_comm=False, overlap_param_gather=False, override_opt_param_scheduler=False, pad_to_multiple_of=8, padded_vocab_size=152064, params_dtype=torch.bfloat16, patch_dim=16, perform_initialization=True, pipeline_model_parallel_size=1, pipeline_model_parallel_split_rank=None, position_embedding_type='rope', post_norm=False, pre_tockens=65536, pref_ftx=0.0, pretrained_checkpoint=None, profile=False, profile_level='level0', profile_ranks=[-1], profile_record_shapes=False, profile_save_path='./profile_dir', profile_step_end=12, profile_step_start=10, profile_with_cpu=False, profile_with_memory=False, profile_with_stack=False, prompt_format='qwen2', prompt_type=None, q_lora_rank=None, qk_layernorm=False, qk_nope_head_dim=None, qk_rope_head_dim=None, query_in_block_prob=0.1, query_pre_attn_scalar=None, rampup_batch_size=None, rank=0, recompute_activation_function=False, recompute_activation_function_num_layers=None, recompute_granularity='full', recompute_in_advance=False, recompute_in_bubble=False, recompute_method='block', recompute_num_layers=48, reduce_recompute_for_last_chunk=False, ref_model=None, reset_attention_mask=False, reset_position_ids=False, retriever_report_topk_accuracies=[], retriever_score_scaling=False, retriever_seq_length=256, retro_add_retriever=False, retro_attention_gate=1, retro_cyclic_train_iters=None, retro_encoder_attention_dropout=0.1, retro_encoder_hidden_dropout=0.1, retro_encoder_layers=2, retro_num_neighbors=2, retro_num_retrieved_chunks=2, retro_project_dir=None, retro_verify_neighbor_count=True, reuse_fp32_param=False, rope_scaling_beta_fast=32, rope_scaling_beta_slow=1, rope_scaling_factor=1.0, rope_scaling_mscale=1.0, rope_scaling_mscale_all_dim=0.0, rope_scaling_original_max_position_embeddings=None, rope_scaling_type=None, rotary_base=1000000.0, rotary_interleaved=False, rotary_percent=1.0, rotary_seq_len_interpolation_factor=None, routed_scaling_factor=None, sample_rate=1.0, save='/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//', save_interval=100, scale_depth=None, scale_emb=None, scatter_gather_tensors_in_pipeline=True, seed=424242, seq_aux=False, seq_length=131072, sequence_parallel=True, sgd_momentum=0.9, shape_order='SBH', shared_expert_gate=False, shared_expert_gate_output_dimension=1, short_seq_prob=0.1, skip_bias_add=True, skip_train=False, sliding_window=None, sparse_mode=4, spec=None, split='100,0,0', square_alibi_mask=False, squared_relu=False, stage=None, standalone_embedding_stage=False, start_weight_decay=0.0, swap_attention=False, swap_modules=None, swiglu=True, swin_backbone_type='tiny', tensor_model_parallel_size=8, tensorboard_dir=None, tensorboard_log_interval=1, tensorboard_queue_size=1000, test_data_path=None, test_mode=False, timing_log_level=0, timing_log_option='minmax', titles_data_path=None, 
tokenizer_kwargs=None, tokenizer_model=None, tokenizer_name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', tokenizer_not_use_fast=True, tokenizer_padding_side='right', tokenizer_type='PretrainedFromHF', topk_group=None, tp_2d=False, tp_comm_bulk_dgrad=True, tp_comm_bulk_wgrad=True, tp_comm_overlap=False, tp_comm_overlap_ag=True, tp_comm_overlap_cfg=None, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_split_rs=True, tp_x=1, tp_y=1, train_data_path=None, train_iters=1000, train_samples=None, transformer_impl='local', transformer_pipeline_model_parallel_size=1, ulysses_degree_in_cp=None, untie_embeddings_and_output_weights=True, use_checkpoint_args=False, use_checkpoint_opt_param_scheduler=False, use_cp_send_recv_overlap=True, use_cpu_initialization=None, use_deter_comp=False, use_dist_ckpt=False, use_distributed_optimizer=True, use_flash_attn=True, use_fused_moe_token_permute_and_unpermute=False, use_fused_ring_attention_update=False, use_fused_rmsnorm=True, use_fused_rotary_pos_emb=True, use_fused_swiglu=True, use_glm_rope=False, use_mc2=False, use_mcore_models=True, use_one_sent_docs=False, use_ring_exchange_p2p=False, use_rotary_position_embeddings=True, v_head_dim=None, valid_data_path=None, variable_seq_lengths=False, virtual_pipeline_model_parallel_size=None, vision_backbone_type='vit', vision_context_parallel=False, vision_downsample_ratio=0.5, vision_downsample_stride=1.0, vision_model_freeze=True, vision_model_lr_decay_rate=1.0, vision_model_lr_mult=1.0, vision_model_recompute=False, vision_model_type='intern_300m', vision_normalize_type='imagenet', vision_pretraining=False, vision_pretraining_type='classify', vision_process_type='dynamic', vision_projector_freeze=False, vision_projector_pre_norm=True, vision_projector_recompute=False, vision_projector_type='mlp', vision_seq_length=1025, vit_load='/', vocab_extra_ids=0, vocab_file=None, vocab_size=None, wandb_exp_name='', wandb_project='', wandb_save_dir='', weight_decay=0.0, weight_decay_incr_style='constant', world_size=512, yaml_cfg=None)
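The parallel layout in the Namespace above is internally consistent, and the data-parallel and gradient-accumulation figures follow from it. A quick arithmetic check (plain Python; the variable names are illustrative, not Megatron's):

    # Editor's sketch: reproduce the bookkeeping implied by the args above.
    world_size = 512                    # NPUs in the job
    tp, pp, cp = 8, 1, 2                # tensor / pipeline / context parallel sizes
    dp = world_size // (tp * pp * cp)   # data-parallel replicas
    assert dp == 32                     # matches data_parallel_size=32

    micro, global_batch = 1, 64         # micro_batch_size, global_batch_size
    accum = global_batch // (micro * dp)
    assert accum == 2                   # two micro-batches accumulated per optimizer step

    # lr_warmup_fraction=0.03 of train_iters=1000 gives 30 warmup iterations.
    assert int(0.03 * 1000) == 30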
model_provider config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=13824, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=True, activation_func=<...>, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=<function init_method_normal.<locals>.init_ at 0x7fb2591db310>, output_layer_init_method=<function scaled_init_method_normal.<locals>.init_ at 0x7fb23001b310>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=True, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0)
model_provider transformer_layer_spec ModuleSpec(module=<...>, params={}, submodules=TransformerLayerSubmodules(input_layernorm=<...>, self_attention=ModuleSpec(module=<...>, params={'attn_mask_type': <...>}, submodules=SelfAttentionSubmodules(linear_qkv=<...>, core_attention=<...>, linear_proj=<...>, q_layernorm=<...>, k_layernorm=<...>)), post_attn_norm=<...>, self_attn_bda=<...>, pre_cross_attn_layernorm=<...>, cross_attention=<...>, cross_attn_bda=<...>, pre_mlp_layernorm=<...>, mlp=ModuleSpec(module=<...>, params={}, submodules=MLPSubmodules(linear_fc1=<...>, linear_fc2=<...>)), post_mlp_layernorm=<...>, mlp_bda=<...>, sharded_state_dict_keys_map={'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_'}))
Building intern_300m model ...
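With num_attention_heads=40, num_query_groups=8, and kv_channels=128 sharded over tensor_model_parallel_size=8, the fused QKV projection of the language model divides evenly across ranks. The arithmetic, as an illustrative sketch (names are not Megatron's):

    heads, groups, head_dim, tp = 40, 8, 128, 8
    q_rows  = (heads // tp) * head_dim        # 5 query heads per rank -> 640 rows
    kv_rows = 2 * (groups // tp) * head_dim   # 1 K and 1 V group per rank -> 256 rows
    assert q_rows + kv_rows == 896            # per-rank fused-QKV output rows
    assert (q_rows + kv_rows) * tp == (40 + 2 * 8) * 128   # 7168 rows in total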
------------------------ vit_args ------------------------
accumulate_allreduce_grads_in_fp32 ........ True
activation_func ........ <...>
adam_beta1 ........ 0.9
adam_beta2 ........ 0.999
adam_eps ........ 1e-08
adaptive_cp_dynamic_attn_mask ........ False
adaptive_cp_manually_set_mask_list ........ False
adaptive_cp_only_reschedule ........ False
adaptive_cp_without_coarse ........ False
adaptive_recompute_device_size ........ -1
adaptive_recompute_device_swap ........ False
adaptive_recompute_profiling_step ........ 10
add_bias_linear ........ True
add_class_token ........ True
add_dense_bias ........ False
add_position_embedding ........ True
add_qkv_bias ........ True
add_rmsnorm_offset ........ False
adlr_autoresume ........ False
adlr_autoresume_interval ........ 1000
apply_layernorm_1p ........ False
apply_query_key_layer_scaling ........ False
apply_residual_connection_post_layernorm ........ False
apply_rope_fusion ........ False
async_tensor_model_parallel_allreduce ........ False
attention_dropout ........ 0.0
attention_mask_on_cpu ........ False
attention_softmax_in_fp32 ........ True
attn_logit_softcapping ........ None
auto_detect_ckpt_format ........ False
barrier_with_L1_time ........ True
bert_binary_head ........ True
bert_embedder_type ........ megatron
bert_load ........ None
bf16 ........ True
bias_activation_fusion ........ False
bias_dropout_fusion ........ False
bias_gelu_fusion ........ False
bias_swiglu_fusion ........ True
biencoder_projection_dim ........ 0
biencoder_shared_query_context_model ........ False
block_data_path ........ None
check_for_nan_in_loss_and_grad ........ True
chunk_size ........ 4096
ckpt_fully_parallel_save ........ False
ckpt_step ........ None
classes_fraction ........ 1.0
clip_grad ........ 1.0
clone_scatter_output_in_embedding ........ True
consumed_train_samples ........ 0
consumed_valid_samples ........ 0
context_parallel_algo ........ megatron_cp_algo
context_parallel_size ........ 1
cp_attention_mask_type ........ causal
cp_window_size ........ 1
create_attention_mask_in_dataloader ........ False
cross_dataset_joint ........ True
data_cache_path ........ None
data_parallel_random_init ........ False
data_parallel_size ........ 32
data_path ........ ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml']
data_per_class_fraction ........ 1.0
data_seq_length ........ 131072
data_sharding ........ True
dataloader_type ........ single
decoder_num_layers ........ None
decoder_seq_length ........ 1025
decoupled_lr ........ None
decoupled_min_lr ........ None
delay_grad_reduce ........ True
delay_param_gather ........ False
dim_model_base ........ None
dino_bottleneck_size ........ 256
dino_freeze_last_layer ........ 1
dino_head_hidden_size ........ 2048
dino_local_crops_number ........ 10
dino_local_img_size ........ 96
dino_norm_last_layer ........ False
dino_teacher_temp ........ 0.07
dino_warmup_teacher_temp ........ 0.04
dino_warmup_teacher_temp_epochs ........ 30
dist_ckpt_format ........ torch_dist
distribute_saved_activations ........ False
distributed_backend ........ nccl
distributed_timeout_minutes ........ 120
dpo_beta ........ 0.1
dpo_ftx ........ 0.0
dpo_label_smoothing ........ 0.0
dpo_loss_type ........ sigmoid
embed_layernorm ........ False
embedding_multiplier_scale ........ 1.0
embedding_path ........ None
empty_unused_memory_level ........ 0
enable_chunk_memory ........ False
enable_chunk_sequence ........ False
enable_hbmfault_repair ........ False
enable_high_availability ........ False
enable_one_logger ........ False
enable_optimizer_state_local_copy ........ False
enable_recompute_layers_per_pp_rank ........ False
enable_token_rearrange_opt ........ False
encoder_num_layers ........ 48
encoder_seq_length ........ 1025
end_weight_decay ........ 0.0
eod_mask_loss ........ False
eval_interval ........ 100
eval_iters ........ 0
evidence_data_path ........ None
exit_duration_in_mins ........ None
exit_interval ........ None
exit_on_missing_checkpoint ........ False
exit_signal_handler ........ False
expert_interval ........ 1
expert_model_parallel_size ........ 1
ffn_hidden_size ........ 4096
fill_neg_inf ........ False
finetune ........ True
first_k_dense_replace ........ None
first_pipeline_num_layers ........ 0
fp16 ........ False
fp16_lm_cross_entropy ........ False
fp32_residual_connection ........ False
fp8 ........ None
fp8_amax_compute_algo ........ most_recent
fp8_amax_history_len ........ 1
fp8_interval ........ 1
fp8_margin ........ 0
fp8_wgrad ........ True
full_shuffle_instruction_dataset ........ False
gated_linear_unit ........ False
geglu ........ False
gelu_tanh ........ False
global_batch_size ........ 64
gradient_accumulation_fusion ........ False
group_query_attention ........ False
head_lr_mult ........ 1.0
hidden_dropout ........ 0.0
hidden_size ........ 1024
high_freq_factor ........ None
hysteresis ........ 2
ict_head_size ........ None
ict_load ........ None
image_size ........ 448
image_token_length ........ 256
img_h ........ 448
img_w ........ 448
independent_parallel ........ True
indexer_batch_size ........ 128
indexer_log_interval ........ 1000
inference_batch_times_seqlen_threshold ........ 512
init_method_std ........ 0.01
init_method_xavier_uniform ........ False
initial_loss_scale ........ 4096.0
input_embeds_norm ........ False
input_jitter ........ True
input_layernorm_in_fp32 ........ False
interleave_sliding_window ........ None
is_instruction_dataset ........ True
is_pairwise_dataset ........ False
iter_per_epoch ........ 1250
jit_compile ........ False
kv_channels ........ 64
kv_head_repeat_before_uly_alltoall ........ True
kv_lora_rank ........ None
language_model_freeze ........ False
layernorm_zero_centered_gamma ........ False
lazy_mpu_init ........ None
load ........ /
load_checkpoint_loosely ........ False
local_rank ........ 0
log_batch_size_to_tensorboard ........ False
log_interval ........ 1
log_learning_rate_to_tensorboard ........ True
log_loss_scale_to_tensorboard ........ True
log_memory_to_tensorboard ........ False
log_num_zeros_in_grad ........ False
log_params_norm ........ False
log_progress ........ False
log_throughput ........ True
log_timers_to_tensorboard ........ False
log_validation_ppl_to_tensorboard ........ False
log_world_size_to_tensorboard ........ False
logit_mask ........ False
lora_alpha ........ 32
lora_fusion ........ False
lora_load ........ None
lora_modules_to_save ........ None
lora_r ........ 16
lora_register_forward_hook ........ ['word_embeddings', 'input_layernorm']
lora_target_modules ........ []
loss_scale ........ None
loss_scale_window ........ 1000
low_freq_factor ........ None
lr ........ 5e-06
lr_decay_iters ........ None
lr_decay_samples ........ None
lr_decay_style ........ cosine
lr_warmup_fraction ........ 0.03
lr_warmup_init ........ 0.0
lr_warmup_iters ........ 0
lr_warmup_samples ........ 0
make_vocab_size_divisible_by ........ 1
manual_gc ........ False
manual_gc_eval ........ True
manual_gc_interval ........ 0
mask_factor ........ 1.0
mask_prob ........ 0.15
mask_type ........ random
masked_softmax_fusion ........ False
max_fps ........ 1
max_num_frame ........ 512
max_num_image ........ 8
max_patch_grid ........ 12
max_position_embeddings ........ 1025
max_tokens_to_oom ........ 12000
merge_file ........ None
micro_batch_size ........ 1
min_loss_scale ........ 1.0
min_lr ........ 1e-07
min_patch_grid ........ 1
mmap_bin_files ........ True
mock_data ........ False
model_type ........ ModelType.encoder_or_decoder
moe_allgather_overlap_comm ........ False
moe_alltoall_overlap_comm ........ False
moe_aux_loss_coeff ........ 0.0
moe_comm_aux_loss_coeff ........ 0.0
moe_device_level_aux_loss_coeff ........ 0.0
moe_expert_capacity_factor ........ None
moe_grouped_gemm ........ False
moe_input_jitter_eps ........ None
moe_intermediate_size ........ None
moe_layer_freq ........ None
moe_pad_expert_input_to_capacity ........ False
moe_per_layer_logging ........ False
moe_permutation_async_comm ........ False
moe_router_load_balancing_type ........ aux_loss
moe_router_topk ........ 2
moe_token_dispatcher_type ........ allgather
moe_token_drop_policy ........ probs
moe_token_dropping ........ False
moe_tp_extend_ep ........ False
moe_train_capacity_factor ........ 1.0
moe_without_activation ........ False
moe_z_loss_coeff ........ 0.0
moe_zero_memory ........ disable
multi_head_latent_attention ........ False
n_shared_experts ........ None
nccl_communicator_config_path ........ None
next_tockens ........ 0
no_load_optim ........ True
no_load_rng ........ True
no_persist_layer_norm ........ False
no_post_layer_norm ........ False
no_save_optim ........ None
no_save_rng ........ None
no_shared_storage ........ False
no_shuffle ........ False
noisy_gate_policy ........ None
noop_layers ........ None
norm_epsilon ........ 1e-06
norm_topk_prob ........ False
normalization ........ LayerNorm
num_attention_heads ........ 16
num_channels ........ 3
num_classes ........ 1000
num_experts ........ None
num_layer_list ........
num_layers ........ 24
num_layers_per_virtual_pipeline_stage ........ None
num_query_groups ........ 16
num_workers ........ 8
one_logger_entity ........ hwinf_dcm
one_logger_project ........ e2e-tracking
one_logger_run_name ........ None
onnx_safe ........ None
openai_gelu ........ False
optimizer ........ adam
original_max_position_embeddings ........ None
output_bert_embeddings ........ False
output_layer_slice_num ........ 1
output_logit_softcapping ........ None
output_multiplier_scale ........ None
overlap_grad_reduce ........ False
overlap_p2p_comm ........ False
overlap_param_gather ........ False
override_opt_param_scheduler ........ False
pad_to_multiple_of ........ 8
padded_vocab_size ........ 152064
params_dtype ........ torch.bfloat16
patch_dim ........ 14
perform_initialization ........ True
pipeline_model_parallel_size ........ 1
pipeline_model_parallel_split_rank ........ None
position_embedding_type ........ rope
post_norm ........ False
pre_tockens ........ 65536
pref_ftx ........ 0.0
pretrained_checkpoint ........ None
profile ........ False
profile_level ........ level0
profile_ranks ........ [-1]
profile_record_shapes ........ False
profile_save_path ........ ./profile_dir
profile_step_end ........ 12
profile_step_start ........ 10
profile_with_cpu ........ False
profile_with_memory ........ False
profile_with_stack ........ False
prompt_format ........ qwen2
prompt_type ........ None
q_lora_rank ........ None
qk_layernorm ........ False
qk_nope_head_dim ........ None
qk_rope_head_dim ........ None
query_in_block_prob ........ 0.1
query_pre_attn_scalar ........ None
rampup_batch_size ........ None
rank ........ 0
recompute_activation_function ........ False
recompute_activation_function_num_layers ........ None
recompute_granularity ........ None
recompute_in_advance ........ False
recompute_in_bubble ........ False
recompute_method ........ None
recompute_num_layers ........ None
reduce_recompute_for_last_chunk ........ False
ref_model ........ None
reset_attention_mask ........ False
reset_position_ids ........ False
retriever_report_topk_accuracies ........ []
retriever_score_scaling ........ False
retriever_seq_length ........ 256
retro_add_retriever ........ False
retro_attention_gate ........ 1
retro_cyclic_train_iters ........ None
retro_encoder_attention_dropout ........ 0.1
retro_encoder_hidden_dropout ........ 0.1
retro_encoder_layers ........ 2
retro_num_neighbors ........ 2
retro_num_retrieved_chunks ........ 2
retro_project_dir ........ None
retro_verify_neighbor_count ........ True
reuse_fp32_param ........ False
rope_scaling_beta_fast ........ 32
rope_scaling_beta_slow ........ 1
rope_scaling_factor ........ 1.0
rope_scaling_mscale ........ 1.0
rope_scaling_mscale_all_dim ........ 0.0
rope_scaling_original_max_position_embeddings ........ None
rope_scaling_type ........ None
rotary_base ........ 1000000.0
rotary_interleaved ........ False
rotary_percent ........ 1.0
rotary_seq_len_interpolation_factor ........ None
routed_scaling_factor ........ None
sample_rate ........ 1.0
save ........ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//
save_interval ........ 100
scale_depth ........ None
scale_emb ........ None
scatter_gather_tensors_in_pipeline ........ True
seed ........ 424242
seq_aux ........ False
seq_length ........ 1025
sequence_parallel ........ False
sgd_momentum ........ 0.9
shape_order ........ SBH
shared_expert_gate ........ False
shared_expert_gate_output_dimension ........ 1
short_seq_prob ........ 0.1
skip_bias_add ........ True
skip_train ........ False
sliding_window ........ None
sparse_mode ........ 4
spec ........ None
split ........ 100,0,0
square_alibi_mask ........ False
squared_relu ........ False
stage ........ None
standalone_embedding_stage ........ False
start_weight_decay ........ 0.0
swap_attention ........ False
swap_modules ........ None
swiglu ........ False
swin_backbone_type ........ tiny
tensor_model_parallel_size ........ 8
tensorboard_dir ........ None
tensorboard_log_interval ........ 1
tensorboard_queue_size ........ 1000
test_data_path ........ None
test_mode ........ False
timing_log_level ........ 0
timing_log_option ........ minmax
titles_data_path ........ None
tokenizer_kwargs ........ None
tokenizer_model ........ None
tokenizer_name_or_path ........ /data_4/models/Qwen/Qwen2.5-14B-Instruct/
tokenizer_not_use_fast ........ True
tokenizer_padding_side ........ right
tokenizer_type ........ PretrainedFromHF
topk_group ........ None
tp_2d ........ False
tp_comm_bulk_dgrad ........ True
tp_comm_bulk_wgrad ........ True
tp_comm_overlap ........ False
tp_comm_overlap_ag ........ True
tp_comm_overlap_cfg ........ None
tp_comm_overlap_rs ........ True
tp_comm_split_ag ........ True
tp_comm_split_rs ........ True
tp_x ........ 1
tp_y ........ 1
train_data_path ........ None
train_iters ........ 1000
train_samples ........ None
transformer_impl ........ local
transformer_pipeline_model_parallel_size ........ 1
ulysses_degree_in_cp ........ None
untie_embeddings_and_output_weights ........ True
use_checkpoint_args ........ False
use_checkpoint_opt_param_scheduler ........ False
use_cp_send_recv_overlap ........ True
use_cpu_initialization ........ None
use_deter_comp ........ False
use_dist_ckpt ........ False
use_distributed_optimizer ........ True
use_flash_attn ........ True
use_fused_moe_token_permute_and_unpermute ........ False
use_fused_ring_attention_update ........ False
use_fused_rmsnorm ........ True
use_fused_rotary_pos_emb ........ True
use_fused_swiglu ........ True
use_glm_rope ........ False
use_mc2 ........ False
use_mcore_models ........ True
use_one_sent_docs ........ False
use_ring_exchange_p2p ........ False
use_rotary_position_embeddings ........ True
v_head_dim ........ None
valid_data_path ........ None
variable_seq_lengths ........ False
virtual_pipeline_model_parallel_size ........ None
vision_backbone_type ........ vit
vision_context_parallel ........ False
vision_downsample_ratio ........ 0.5
vision_downsample_stride ........ 1.0
vision_model_freeze ........ True
vision_model_lr_decay_rate ........ 1.0
vision_model_lr_mult ........ 1.0
vision_model_recompute ........ False
vision_model_type ........ intern_300m
vision_normalize_type ........ imagenet
vision_pretraining ........ False
vision_pretraining_type ........ classify
vision_process_type ........ dynamic
vision_projector_freeze ........ False
vision_projector_pre_norm ........ True
vision_projector_recompute ........ False
vision_projector_type ........ mlp
vision_seq_length ........ 1025
vit_load ........ /
vocab_extra_ids ........ 0
vocab_file ........ None
vocab_size ........ None
wandb_exp_name ........
wandb_project ........
wandb_save_dir ........
weight_decay ........ 0.0
weight_decay_incr_style ........ constant
world_size ........ 512
yaml_cfg ........ None
-------------------- end of vit_args ---------------------
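The vit_args hang together: a 448x448 image with a 14x14 patch stem gives 32x32 = 1024 patches, plus one class token = 1025, which is exactly the vision_seq_length, encoder_seq_length, and max_position_embeddings above; vision_downsample_ratio=0.5 then reduces the 32x32 grid to 16x16 = 256 tokens, matching image_token_length. As an illustrative check:

    img, patch = 448, 14
    grid = img // patch                  # 32 patches per side
    assert grid * grid + 1 == 1025       # + class token = vision_seq_length

    downsample = 0.5                     # vision_downsample_ratio
    assert int(grid * downsample) ** 2 == 256   # image_token_length fed to the LM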
MegatronVisionModel vision_model_config VisionTransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=24, first_pipeline_num_layers=0, independent_parallel=True, hidden_size=1024, num_attention_heads=16, num_query_groups=16, ffn_hidden_size=4096, kv_channels=64, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=True, add_qkv_bias=True, gated_linear_unit=False, activation_func=<...>, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='LayerNorm', qk_layernorm=False, test_mode=False, init_method=<function init_method_normal.<locals>.init_ at 0x7fb23001b3a0>, output_layer_init_method=<function scaled_init_method_normal.<locals>.init_ at 0x7fb23001be50>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=False, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=1025, rotary_percent=1.0, img_w=448, img_h=448, patch_dim=14)
MegatronVisionModel vision_model_layer_spec ModuleSpec(module=<...>, params={}, submodules=TransformerLayerSubmodules(input_layernorm=<...>, self_attention=ModuleSpec(module=<...>, params={'attn_mask_type': <...>}, submodules=SelfAttentionSubmodules(linear_qkv=<...>, core_attention=<...>, linear_proj=<...>, q_layernorm=None, k_layernorm=None)), post_attn_norm=<...>, self_attn_bda=<...>, pre_cross_attn_layernorm=<...>, cross_attention=<...>, cross_attn_bda=<...>, pre_mlp_layernorm=<...>, mlp=ModuleSpec(module=<...>, params={}, submodules=MLPSubmodules(linear_fc1=<...>, linear_fc2=<...>)), post_mlp_layernorm=<...>, mlp_bda=<...>, sharded_state_dict_keys_map={}))
WARNING: could not find the metadata file /latest_checkpointed_iteration.txt
    will not load any checkpoints and will start from random
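This warning is expected rather than an error: vit_load is '/', so no checkpoint tracker exists there and the vision tower starts from random init (its weights then arrive with the stage-2 checkpoint in CKPT_LOAD_DIR, which is loaded separately). Megatron-style loaders gate this on a tracker file; a minimal sketch of that check, assuming Megatron's checkpoint layout (not the actual ModelLink code):

    import os

    def latest_checkpoint_tag(load_dir: str):
        # Return the checkpointed iteration tag, or None to start from random.
        tracker = os.path.join(load_dir, "latest_checkpointed_iteration.txt")
        if not os.path.isfile(tracker):
            return None                # -> produces the WARNING printed above
        with open(tracker) as f:
            return f.read().strip()    # usually an iteration number or 'release'

    print(latest_checkpoint_tag("/"))  # None for vit_load='/'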
vision_projector_config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=1024, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=False, activation_func=<...>, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=<function init_method_normal.<locals>.init_ at 0x7fb23001bee0>, output_layer_init_method=<function scaled_init_method_normal.<locals>.init_ at 0x7fb23001bf70>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0)
vision_projector_layer_spec MLPSubmodules(linear_fc1=<...>, linear_fc2=<...>)
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
[... => set param lines for external_feature_model.vit.decoder.layers.1 through layers.12 repeat the layer-0 pattern above with identical shapes per layer: ls1/ls2 [1024]; input_layernorm and pre_mlp_layernorm weight/bias [1024]; self_attention.linear_qkv weight [384, 1024], bias [384]; self_attention.linear_proj weight [1024, 128], bias [1024]; mlp.linear_fc1 weight [512, 1024], bias [512]; mlp.linear_fc2 weight [1024, 512], bias [1024]; interleaved duplicates of the same prints from other ranks omitted ...]
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. 
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
[The following 14 freeze messages repeat identically for every ViT decoder layer N = 0..22 visible in this excerpt (layer 22 is cut off mid-stream); verbatim duplicates from concurrently logging ranks have been collapsed into one ordered listing.]
=> set param external_feature_model.vit.decoder.layers.N.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. vision_model_freeze => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.{0-23}.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
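The messages above are the per-parameter log of switching off gradients for the InternViT tower. A minimal sketch of the pattern, assuming a plain PyTorch module and a hypothetical freeze_vision_tower helper (the actual ModelLink hook and call site may differ):

    import torch

    def freeze_vision_tower(model: torch.nn.Module,
                            prefix: str = "external_feature_model.vit.") -> None:
        # Disable gradients for every parameter under the vision tower and
        # echo the same "=> set param ..." message this training log shows.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")

A quick check afterwards, e.g. sum(p.numel() for p in model.parameters() if not p.requires_grad), confirms that only the vision tower is excluded from optimization.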
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
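The shard shapes in the freeze log are consistent with the tensor-parallel degree in the run name (tp8pp1cp2). A small arithmetic check, assuming TP=8 and the usual Megatron convention that ColumnParallelLinear splits the output dimension while RowParallelLinear splits the input dimension (hidden size 1024 comes from the Conv2d/LayerNorm dump above; the 4096 MLP width is inferred from the 512-wide fc1 shard):

    TP = 8
    hidden = 1024   # InternViT hidden size, per the module dump
    ffn = 4096      # inferred: 512 * TP

    assert 3 * hidden // TP == 384   # linear_qkv weight shard [384, 1024], bias [384]
    assert hidden // TP == 128       # linear_proj weight shard [1024, 128]
    assert ffn // TP == 512          # linear_fc1 shard [512, 1024]; fc2 mirrors it as [1024, 512]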
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. 
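The block above records the vision-tower freeze: every parameter of the 24 InternViT layers, plus the patch-embedding conv, class token, and position embeddings, gets requires_grad set to False (each rank prints its own copy, which is why the raw stream interleaves the same lines many times). A minimal sketch of what such a freeze routine typically looks like; the helper name freeze_vit and the prefix match are assumptions, not code from this repository:

```python
import torch

def freeze_vit(model: torch.nn.Module) -> None:
    """Freeze the vision tower (hypothetical helper).

    Mirrors the "=> set param ... requires grad to False." lines
    in the log above.
    """
    for name, param in model.named_parameters():
        if name.startswith("external_feature_model.vit."):
            param.requires_grad = False
            # param.shape prints as torch.Size([...]), as in the log
            print(f"=> set param {name} {param.shape} requires grad to False.")
```

Every rank executes this loop on its own model replica, so in a multi-process launch the same message appears once per process.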
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. 
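For reference, the freeze pass that emits the messages above has roughly this shape in PyTorch. This is a minimal sketch, assuming the model is a torch.nn.Module and the vision tower lives under the external_feature_model.vit prefix; the helper name freeze_vision_model is hypothetical, not the actual ModelLink code.

import torch.nn as nn

def freeze_vision_model(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
    # Hypothetical reconstruction: freeze every parameter under the vision
    # tower and log it in the same format as the training log above.
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            print(f"=> set param {name} {param.shape} requires grad to False.")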
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
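The per-rank shapes in the freeze messages line up with the tensor-parallel degree 8 implied by the tp8 in the script name: ColumnParallelLinear shards its output dimension across ranks and RowParallelLinear shards its input dimension. A quick arithmetic check, assuming hidden size 1024 and a 4x MLP expansion for the InternViT tower (values read off the printout, not from a config):

# Per-rank parameter shapes under tensor parallelism, assuming tp = 8 (from
# the script name) and hidden_size = 1024 (from the module printout above).
hidden_size, tp = 1024, 8

qkv_per_rank = (3 * hidden_size // tp, hidden_size)  # (384, 1024): fused QKV weight, output dim sharded
proj_per_rank = (hidden_size, hidden_size // tp)     # (1024, 128): attention output proj, input dim sharded
fc1_per_rank = (4 * hidden_size // tp, hidden_size)  # (512, 1024): MLP fc1, output dim sharded
fc2_per_rank = (hidden_size, 4 * hidden_size // tp)  # (1024, 512): MLP fc2, input dim sharded

# Embedding(1025, 1024): one class token plus (448 // 14) ** 2 = 1024 patch
# positions, assuming a 448x448 base resolution (an assumption, not logged).
num_positions = 1 + (448 // 14) ** 2

print(qkv_per_rank, proj_per_rank, fc1_per_rank, fc2_per_rank, num_positions)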
model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
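The parameter count above is per model-parallel rank. A sketch of how such a number is typically obtained, and what it implies at tp8; the extrapolation is approximate, since tensors that are replicated rather than sharded (layernorms, for instance) are counted once on every rank:

```python
import torch

def count_params(model: torch.nn.Module) -> int:
    # Parameters owned by this (tensor, pipeline) rank only.
    return sum(p.numel() for p in model.parameters())

per_rank = 1_887_497_216                           # logged for rank (6, 0)
print(f"~{8 * per_rank / 1e9:.1f}B total at tp8")  # ~15.1B: a 14B LLM plus
# the ~300M ViT and its projector, modulo replicated tensors.
```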
model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.vision_model_freeze => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
[... identical "requires grad to False" messages for decoder.layers.1 through decoder.layers.23 ...]
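The "=> set param ... requires grad to False." lines are emitted once per rank, which is why the raw stream interleaves several copies of the same message. A minimal sketch of the freeze step, assuming it is a plain requires_grad loop over the ViT prefix (the function name is hypothetical, not the project's actual helper):

    import torch

    def freeze_vision_tower(model: torch.nn.Module,
                            prefix: str = "external_feature_model.vit") -> None:
        # Disable gradients for every parameter under the ViT submodule and
        # echo the same "=> set param ..." message seen in the log above.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")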
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
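The weight shapes in the freeze log are per-tensor-parallel-rank shards (the script name indicates TP=8): ColumnParallelLinear splits its output dimension across ranks and RowParallelLinear splits its input dimension. A quick arithmetic check with the hidden/FFN sizes implied by the dump (the helper name is illustrative):

    def vit_shard_shapes(hidden_size: int = 1024, ffn_hidden: int = 4096,
                         tp: int = 8) -> dict:
        # ColumnParallelLinear shards the output dim; RowParallelLinear
        # shards the input dim. Reproduces the per-rank shapes in the log.
        return {
            "self_attention.linear_qkv.weight": (3 * hidden_size // tp, hidden_size),  # (384, 1024)
            "self_attention.linear_proj.weight": (hidden_size, hidden_size // tp),     # (1024, 128)
            "mlp.linear_fc1.weight": (ffn_hidden // tp, hidden_size),                  # (512, 1024)
            "mlp.linear_fc2.weight": (hidden_size, ffn_hidden // tp),                  # (1024, 512)
        }

    assert vit_shard_shapes()["self_attention.linear_qkv.weight"] == (384, 1024)

With tp=1 the same arithmetic gives the unsharded (3072, 1024) and (1024, 1024) shapes, which is why single-device logs of the same model show different sizes.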
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216
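Note this count is per (tensor, pipeline) shard, not the whole model: with TP=8, the eight ~1.89B shards together cover Qwen2.5-14B plus the ~300M InternViT and projector, with non-sharded tensors (layernorms, layer scales) replicated on every rank. A sketch of how such a number is typically computed (the actual Megatron helper is not shown in this log):

    import torch

    def count_params(model: torch.nn.Module) -> int:
        # Parameters held by this rank's shard; frozen ViT weights are
        # included in the total even though they receive no gradients.
        return sum(p.numel() for p in model.parameters())

    # Would print a line in the same style as the log above:
    # print(f"> number of parameters on (tensor, pipeline) model parallel "
    #       f"rank (0, 0): {count_params(model)}")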
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
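The freeze messages above amount to one loop over the named parameters of the vision tower, turning off autograd for each. A minimal sketch of such a pass, assuming a hypothetical helper name (freeze_vision_tower); the print format is taken from the log, but the training script's actual implementation may differ:

    import torch

    def freeze_vision_tower(model: torch.nn.Module,
                            prefix: str = "external_feature_model.vit") -> None:
        # Hypothetical helper: disable gradients for every parameter of the
        # ViT sub-module so only the projector and language model train.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")

Frozen parameters still take part in the forward pass; they simply receive no gradient and no optimizer state.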
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
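The ViT head of this printout pins down the patch geometry: conv1 patchifies with a 14x14 kernel at stride 14, and the position-embedding table has 1025 rows, consistent with 1024 patch tokens plus one class token, i.e. a 448x448 input (448 / 14 = 32, and 32 * 32 = 1024). A short check of that arithmetic; the 448x448 input size is inferred from the embedding table, not stated in the log:

    import torch

    # Patchify exactly as printed: Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)).
    conv1 = torch.nn.Conv2d(3, 1024, kernel_size=14, stride=14)

    # 448x448 is an inference from Embedding(1025, 1024):
    # (448 // 14) ** 2 = 1024 patch tokens, plus one class token = 1025 positions.
    x = torch.randn(1, 3, 448, 448)
    patches = conv1(x)                           # (1, 1024, 32, 32)
    tokens = patches.flatten(2).transpose(1, 2)  # (1, 1024, 1024): batch, seq, hidden
    assert tokens.shape[1] + 1 == 1025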
(the same vit parameter-freeze messages repeat, interleaved, from the remaining model-parallel ranks)
> number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216
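The per-rank parameter counts (about 1.89e9 on ranks (3, 0) and (4, 0)) are shard sizes, not the full model: the sharded shapes in the freeze messages, e.g. linear_qkv.weight of [384, 1024] where the unsharded projection would be [3072, 1024], indicate an 8-way tensor-parallel split. A sketch of the kind of count being reported, using plain nn.Module accounting and leaving out Megatron's rank bookkeeping:

    import torch

    def local_param_count(model: torch.nn.Module) -> int:
        # Parameter elements materialized on this rank's shard. Column- and
        # row-parallel weights contribute 1/8 of their full size under an
        # 8-way tensor-parallel split, while layernorms and other small
        # tensors are replicated on every rank.
        return sum(p.numel() for p in model.parameters())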
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
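[editor's note: the freeze pass logged above amounts to a loop over the vision tower's parameters that flips requires_grad and prints each one. A minimal sketch of the idea; the helper name vision_model_freeze and the name prefix are taken from this log, but the actual ModelLink/Megatron implementation may differ:

    import torch

    def vision_model_freeze(model: torch.nn.Module) -> None:
        # Freeze the ViT so only the language model and projector train.
        for name, param in model.named_parameters():
            if name.startswith("external_feature_model.vit."):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")
]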
> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (1, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (5, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216
vision_model_freeze
[a second, identical per-parameter freeze listing (class_token, conv1, position_embeddings, decoder layers 0-23) and additional copies of the model repr are emitted by the other tensor-parallel ranks; these verbatim repeats are elided, and the rank parameter counts scattered through them are consolidated above]
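[editor's note: the shapes in the freeze listing are per-rank shards, not full tensors. The run uses tensor parallel size 8 (the tp8 in the script name); Megatron's ColumnParallelLinear splits the output dimension and RowParallelLinear splits the input dimension. A sketch of the arithmetic, assuming InternViT hidden size 1024 and MLP width 4096, which are consistent with the shard shapes logged above:

    TP = 8            # tensor-parallel world size
    hidden = 1024     # InternViT hidden size

    # ColumnParallelLinear shards the output dim:
    qkv_out = 3 * hidden // TP   # 384 -> linear_qkv.weight [384, 1024]
    fc1_out = 4096 // TP         # 512 -> mlp.linear_fc1.weight [512, 1024]

    # RowParallelLinear shards the input dim:
    proj_in = hidden // TP       # 128 -> linear_proj.weight [1024, 128]
    fc2_in = 4096 // TP          # 512 -> mlp.linear_fc2.weight [1024, 512]
]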
model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
> number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216
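The "=> set param ... requires grad to False." lines above are the vision_model_freeze step: before stage-3 finetuning, every InternViT parameter (class token, patch conv, position embeddings, and all 24 transformer layers) is taken out of the trainable set, while the Qwen2.5 decoder and the projector stay trainable. A minimal sketch of such a freeze helper (an illustration of the pattern, not the actual ModelLink source) is:

```python
import torch.nn as nn

def freeze_vision_model(model: nn.Module,
                        prefix: str = "external_feature_model.vit") -> None:
    """Disable gradients for every parameter under `prefix`, logging each one
    in the same format as the lines in this log."""
    print("vision_model_freeze")
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            print(f"=> set param {name} {param.shape} requires grad to False.")
```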
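Each "> number of parameters on (tensor, pipeline) model parallel rank (r, p): N" line is printed once per model-parallel rank. With TP=8 and PP=1 the weights are sharded roughly evenly across the eight tensor-parallel ranks, so every rank reports the same 1,887,497,216 local elements (8 x 1.89B, about 15.1B in total: the 14B decoder plus the ~300M ViT and the projector). A sketch of the counting, assuming the standard megatron.core.parallel_state rank accessors (the exact print site in ModelLink is an assumption):

```python
from megatron.core import parallel_state

def report_param_count(model) -> None:
    # Count only this rank's shard of the weights; tensor- and
    # vocab-parallel layers contribute 1/TP of their full size.
    n = sum(p.numel() for p in model.parameters())
    tp = parallel_state.get_tensor_model_parallel_rank()
    pp = parallel_state.get_pipeline_model_parallel_rank()
    print(f"> number of parameters on (tensor, pipeline) "
          f"model parallel rank ({tp}, {pp}): {n}")
```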
INFO:megatron.core.distributed.param_and_grad_buffer:Number of buckets for gradient all-reduce / reduce-scatter: 39
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 1 (97320960 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.output_layer.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 2 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.final_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 3 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 4 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 5 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 6 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 7 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 8 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 9 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 10 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 11 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 12 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 13 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 14 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 15 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 16 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 17 (43270016 elements): INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 18 (52112256 elements): INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 19 (42287872 elements): INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.mlp.linear_fc2.weight _get_param_groups name 
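The bucket listing above is emitted while megatron.core.distributed.param_and_grad_buffer builds the data-parallel gradient buffers: parameters are packed, in roughly reverse registration order (the order their gradients become ready during the backward pass), into fixed-size buckets so each bucket's gradient reduction can overlap with the rest of the backward pass. Below is a minimal sketch of that packing, not the Megatron implementation; the 40M-element threshold and the intra-layer parameter order are assumptions, so the exact bucket boundaries only approximate the listing, but the per-rank shard sizes are consistent with the bucket totals printed above (Qwen2.5-14B, hidden 5120, FFN 13824, 40 Q + 8 KV heads, TP=8).

```python
from typing import List, Tuple

BUCKET_SIZE = 40_000_000  # element threshold; assumed, not read from this log

def build_buckets(params: List[Tuple[str, int]]) -> List[List[str]]:
    """Pack params into buckets, walking in reverse registration order."""
    buckets, current, elements = [], [], 0
    for name, numel in reversed(params):
        current.append(name)
        elements += numel
        if elements >= BUCKET_SIZE:  # cut a bucket once the threshold is crossed
            buckets.append(current)
            current, elements = [], 0
    if current:
        buckets.append(current)
    return buckets

# Per-rank shard sizes consistent with the totals above (Qwen2.5-14B under TP=8).
LAYER = [
    ("input_layernorm.weight", 5_120),
    ("self_attention.linear_qkv.weight", 4_587_520),   # (40+8+8)*128 x 5120 / 8
    ("self_attention.linear_qkv.bias", 896),
    ("self_attention.linear_proj.weight", 3_276_800),  # 5120 x 5120 / 8
    ("pre_mlp_layernorm.weight", 5_120),
    ("mlp.linear_fc1.weight", 17_694_720),             # gate+up: 2*13824 x 5120 / 8
    ("mlp.linear_fc2.weight", 8_847_360),              # 13824 x 5120 / 8
]
params = [(f"module.decoder.layers.{i}.{n}", k) for i in range(48) for n, k in LAYER]
for i, bucket in enumerate(build_buckets(params)):
    print(f"bucket {i}: {len(bucket)} params")
```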
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 20 (43270016 elements): module.decoder.layers.24.input_layernorm.weight, module.decoder.layers.23.pre_mlp_layernorm.weight, module.decoder.layers.23.self_attention.linear_qkv.bias, module.decoder.layers.22.mlp.linear_fc2.weight, module.decoder.layers.23.mlp.linear_fc2.weight, module.decoder.layers.23.mlp.linear_fc1.weight, module.decoder.layers.23.self_attention.linear_qkv.weight, module.decoder.layers.23.self_attention.linear_proj.weight, module.decoder.layers.23.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 21 (52112256 elements): module.decoder.layers.22.mlp.linear_fc1.weight, module.decoder.layers.22.self_attention.linear_qkv.weight, module.decoder.layers.22.self_attention.linear_proj.weight, module.decoder.layers.22.input_layernorm.weight, module.decoder.layers.22.self_attention.linear_qkv.bias, module.decoder.layers.22.pre_mlp_layernorm.weight, module.decoder.layers.21.mlp.linear_fc2.weight, module.decoder.layers.21.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 22 (42287872 elements): module.decoder.layers.21.pre_mlp_layernorm.weight, module.decoder.layers.21.self_attention.linear_qkv.bias, module.decoder.layers.20.mlp.linear_fc2.weight, module.decoder.layers.20.mlp.linear_fc1.weight, module.decoder.layers.20.self_attention.linear_qkv.weight, module.decoder.layers.20.self_attention.linear_proj.weight, module.decoder.layers.21.input_layernorm.weight, module.decoder.layers.21.self_attention.linear_proj.weight, module.decoder.layers.21.self_attention.linear_qkv.weight, module.decoder.layers.20.pre_mlp_layernorm.weight, module.decoder.layers.20.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 23 (43270016 elements): module.decoder.layers.20.input_layernorm.weight, module.decoder.layers.19.pre_mlp_layernorm.weight, module.decoder.layers.19.self_attention.linear_qkv.bias, module.decoder.layers.18.mlp.linear_fc2.weight, module.decoder.layers.19.mlp.linear_fc1.weight, module.decoder.layers.19.mlp.linear_fc2.weight, module.decoder.layers.19.self_attention.linear_qkv.weight, module.decoder.layers.19.self_attention.linear_proj.weight, module.decoder.layers.19.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 24 (52112256 elements): module.decoder.layers.18.mlp.linear_fc1.weight, module.decoder.layers.18.self_attention.linear_qkv.weight, module.decoder.layers.18.self_attention.linear_proj.weight, module.decoder.layers.18.input_layernorm.weight, module.decoder.layers.18.self_attention.linear_qkv.bias, module.decoder.layers.18.pre_mlp_layernorm.weight, module.decoder.layers.17.mlp.linear_fc2.weight, module.decoder.layers.17.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 25 (42287872 elements): module.decoder.layers.17.self_attention.linear_qkv.bias, module.decoder.layers.16.mlp.linear_fc2.weight, module.decoder.layers.16.mlp.linear_fc1.weight, module.decoder.layers.16.self_attention.linear_qkv.weight, module.decoder.layers.16.self_attention.linear_proj.weight, module.decoder.layers.17.pre_mlp_layernorm.weight, module.decoder.layers.17.self_attention.linear_qkv.weight, module.decoder.layers.17.input_layernorm.weight, module.decoder.layers.17.self_attention.linear_proj.weight, module.decoder.layers.16.pre_mlp_layernorm.weight, module.decoder.layers.16.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 26 (43270016 elements): module.decoder.layers.16.input_layernorm.weight, module.decoder.layers.15.pre_mlp_layernorm.weight, module.decoder.layers.15.self_attention.linear_qkv.bias, module.decoder.layers.14.mlp.linear_fc2.weight, module.decoder.layers.15.mlp.linear_fc1.weight, module.decoder.layers.15.mlp.linear_fc2.weight, module.decoder.layers.15.self_attention.linear_qkv.weight, module.decoder.layers.15.self_attention.linear_proj.weight, module.decoder.layers.15.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 27 (52112256 elements): module.decoder.layers.14.mlp.linear_fc1.weight, module.decoder.layers.14.self_attention.linear_proj.weight, module.decoder.layers.14.self_attention.linear_qkv.weight, module.decoder.layers.14.input_layernorm.weight, module.decoder.layers.14.self_attention.linear_qkv.bias, module.decoder.layers.14.pre_mlp_layernorm.weight, module.decoder.layers.13.mlp.linear_fc2.weight, module.decoder.layers.13.mlp.linear_fc1.weight
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
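The _get_param_groups lines interleaved with the bucket listing come from optimizer setup: each parameter is mapped to a key that decides which optimizer param group it joins. By the pattern above, layernorm weights and biases get a 0.0 first field (no weight decay) while matmul weights and the word embeddings get 1.0. A toy sketch of that grouping follows; the tuple layout (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) matches Megatron-core's optimizer setup but the field order is inferred, not confirmed by this log, and the helper name is illustrative only.

```python
from collections import defaultdict

def build_param_groups(param_names):
    """Toy grouping: no weight decay for biases and norm weights."""
    groups = defaultdict(list)
    for name in param_names:
        no_wd = name.endswith(".bias") or "layernorm" in name
        key = (0.0 if no_wd else 1.0, 1.0, False, False)
        print("_get_param_groups name", name, "key", key)
        groups[key].append(name)
    return groups

groups = build_param_groups([
    "module.module.embedding.word_embeddings.weight",
    "module.module.decoder.layers.0.input_layernorm.weight",
    "module.module.decoder.layers.0.self_attention.linear_qkv.weight",
    "module.module.decoder.layers.0.self_attention.linear_qkv.bias",
    "module.module.decoder.layers.0.mlp.linear_fc1.weight",
])
# -> two optimizer groups: weight-decayed matmul/embedding weights, and a
#    no-decay group holding the layernorm weights and the qkv bias.
```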
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 28 (42287872 elements): module.decoder.layers.13.pre_mlp_layernorm.weight, module.decoder.layers.13.self_attention.linear_qkv.bias, module.decoder.layers.12.mlp.linear_fc2.weight, module.decoder.layers.12.mlp.linear_fc1.weight, module.decoder.layers.12.self_attention.linear_qkv.weight, module.decoder.layers.12.self_attention.linear_proj.weight, module.decoder.layers.13.self_attention.linear_qkv.weight, module.decoder.layers.13.self_attention.linear_proj.weight, module.decoder.layers.13.input_layernorm.weight, module.decoder.layers.12.pre_mlp_layernorm.weight, module.decoder.layers.12.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 29 (43270016 elements): module.decoder.layers.12.input_layernorm.weight, module.decoder.layers.11.pre_mlp_layernorm.weight, module.decoder.layers.11.self_attention.linear_qkv.bias, module.decoder.layers.10.mlp.linear_fc2.weight, module.decoder.layers.11.mlp.linear_fc1.weight, module.decoder.layers.11.mlp.linear_fc2.weight, module.decoder.layers.11.self_attention.linear_qkv.weight, module.decoder.layers.11.self_attention.linear_proj.weight, module.decoder.layers.11.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 30 (52112256 elements): module.decoder.layers.10.mlp.linear_fc1.weight, module.decoder.layers.10.self_attention.linear_proj.weight, module.decoder.layers.10.self_attention.linear_qkv.weight, module.decoder.layers.10.input_layernorm.weight, module.decoder.layers.10.self_attention.linear_qkv.bias, module.decoder.layers.10.pre_mlp_layernorm.weight, module.decoder.layers.9.mlp.linear_fc2.weight, module.decoder.layers.9.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 31 (42287872 elements): module.decoder.layers.9.pre_mlp_layernorm.weight, module.decoder.layers.9.self_attention.linear_qkv.bias, module.decoder.layers.8.mlp.linear_fc2.weight, module.decoder.layers.8.mlp.linear_fc1.weight, module.decoder.layers.8.self_attention.linear_qkv.weight, module.decoder.layers.8.self_attention.linear_proj.weight, module.decoder.layers.9.self_attention.linear_qkv.weight, module.decoder.layers.9.self_attention.linear_proj.weight, module.decoder.layers.9.input_layernorm.weight, module.decoder.layers.8.pre_mlp_layernorm.weight, module.decoder.layers.8.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 32 (43270016 elements): module.decoder.layers.8.input_layernorm.weight, module.decoder.layers.7.self_attention.linear_proj.weight
_get_param_groups name module.module.decoder.layers.{13..20}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.input_layernorm.weight _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.mlp.linear_fc1.weight_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 33 (52112256 elements): _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.mlp.linear_fc1.weight_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight 
key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.input_layernorm.weight _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 34 (42287872 elements):_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) 
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.input_layernorm.weight_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.self_attention.linear_proj.weight _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key 
(0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 35 (43270016 elements): _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.mlp.linear_fc1.weight_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.self_attention.linear_proj.weight _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.input_layernorm.weight _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.input_layernorm.weight _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.pre_mlp_layernorm.weight_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 36 (52112256 elements):_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.input_layernorm.weight _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 37 (42287872 elements): _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.input_layernorm.weight_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias 
key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_proj.weight _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.pre_mlp_layernorm.weight_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) 
[_get_param_groups records, deduplicated: the raw stream interleaves identical copies of each record from every rank; decoder layers 0-41 share the same key per parameter type]
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
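Note: the _get_param_groups records above are Megatron-style debug prints emitted while the optimizer partitions parameters into groups. The 4-tuple printed after "key" appears to be (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr): layernorm weights and all biases get wd_mult 0.0 (no weight decay), while matmul weights get 1.0. Below is a minimal sketch of that grouping step, assuming a plain torch.nn.Module; it reproduces the logged behavior but is not the exact ModelLink/Megatron source.

from collections import defaultdict

import torch


def get_param_groups(model: torch.nn.Module,
                     no_weight_decay_cond=None,
                     lr_mult: float = 1.0):
    """Group parameters by a (wd_mult, lr_mult, ...) key tuple (sketch)."""
    params_map = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # No weight decay for biases and 1-D (norm) parameters; this matches
        # the (0.0, ...) keys logged for *_layernorm.* and *.bias above.
        if no_weight_decay_cond is not None:
            no_wd = no_weight_decay_cond(name, param)
        else:
            no_wd = name.endswith(".bias") or param.ndim == 1
        wd_mult = 0.0 if no_wd else 1.0
        # The last two flags mirror the logged tuple (both False here);
        # they look like is_expert_parallel / is_decoupled_lr.
        key = (wd_mult, lr_mult, False, False)
        print(f"_get_param_groups name {name} key {key}")
        params_map[key].append(param)
    return [
        {"params": ps, "wd_mult": k[0], "lr_mult": k[1]}
        for k, ps in params_map.items()
    ]

Grouping on the full key tuple rather than on wd_mult alone means parameters that later need distinct learning-rate scaling or expert-parallel handling already sit in separate optimizer groups.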
[param_and_grad_buffer records, de-interleaved; the four layer-0 weights at the top belong to a bucket whose header scrolled by earlier in the log]
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 38 (97326080 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.embedding.word_embeddings.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 39 (1187840 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc2.weight
module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
[_get_param_groups output, consolidated: each rank prints one "name <param> key <tuple>" line per parameter, and the per-rank streams were interleaved in the raw log (often with the separating space lost, e.g. "False)_get_param_groups"). Deduplicated, this stretch of the log covers decoder layers 18-47 plus the final layernorm and the output layer; earlier layers appear in the preceding part of the log. Only two key tuples occur:]

_get_param_groups name module.module.decoder.layers.<N>.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
    [repeated for N = 18 .. 47]
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

[Pattern: every layernorm weight and every linear_qkv bias carries key (0.0, 1.0, False, False); every linear weight, including the output layer, carries key (1.0, 1.0, False, False).]
module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, 
False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
_get_param_groups: per-parameter key = (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr). The raw dump interleaves the same entries from several ranks and drops spaces before some "_get_param_groups" markers; deduplicated, the assignments in this excerpt are:
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
(N ranges over decoder layers 0 through 45 in this excerpt.) In short: embedding and linear weights get wd_mult 1.0, while layernorm weights, layernorm biases, and QKV biases get wd_mult 0.0; lr_mult is 1.0 everywhere, and no parameter is expert-parallel or uses a decoupled learning rate.
module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight 
key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups 
_get_param_groups logs one line per parameter per rank, keyed by a (wd_mult, lr_mult, is_expert_parallel, is_decoupled) tuple. The interleaved per-rank output is deduplicated below; the per-layer pattern repeats identically for decoder layers 0-47 (N = 0..47):

_get_param_groups name module.module.decoder.layers.N.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

Parameters outside the decoder layers:

_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

In short: weight matrices get wd_mult 1.0, while layernorm weights and all biases get wd_mult 0.0; lr_mult is 1.0 for every parameter, and nothing is expert-parallel or on a decoupled learning rate.
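The key tuples above follow the usual Megatron-style grouping rule: weight decay is disabled for biases and 1-D tensors (layernorms) and kept for weight matrices. The following is a minimal, self-contained sketch of that rule, not the project's actual _get_param_groups; the function name and toy model are illustrative only.

```python
# Hedged sketch: reproduce the (wd_mult, lr_mult, is_expert_parallel,
# is_decoupled) keys seen in the log above. Illustrative, not lcvlm_modellink's
# real implementation.
from collections import defaultdict

import torch.nn as nn


def get_param_groups(model, lr_mult=1.0):
    """Bucket parameters by (wd_mult, lr_mult, is_expert_parallel, is_decoupled)."""
    groups = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Biases and 1-D tensors (e.g. *_layernorm.weight) get no weight decay,
        # matching the (0.0, 1.0, False, False) entries in the log.
        wd_mult = 0.0 if name.endswith("bias") or param.ndim == 1 else 1.0
        key = (wd_mult, lr_mult, False, False)  # expert-parallel / decoupled LR unused here
        print(f"_get_param_groups name {name} key {key}")
        groups[key].append(param)
    return [
        {"params": params, "wd_mult": k[0], "lr_mult": k[1]}
        for k, params in groups.items()
    ]


if __name__ == "__main__":
    toy = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
    for group in get_param_groups(toy):
        print(len(group["params"]), group["wd_mult"], group["lr_mult"])
```

Run against the toy model, this yields two groups: one with wd_mult 1.0 (the Linear weight) and one with wd_mult 0.0 (the Linear bias plus both LayerNorm parameters), mirroring the split logged for the full model.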
The same per-parameter listing is then re-emitted through the project logger (INFO:lcvlm_modellink.core.optimizer:_get_param_groups name ... key ...) as the remaining ranks reach optimizer setup; the entries and key tuples are identical to those deduplicated above. Interleaved with that output, the optimizer itself is configured:

INFO:megatron.core.optimizer:Setting up optimizer with OptimizerConfig(optimizer='adam', lr=5e-06, min_lr=1e-07, decoupled_lr=None, decoupled_min_lr=None, weight_decay=0.0, fp16=False, bf16=True, params_dtype=torch.bfloat16, loss_scale=None, initial_loss_scale=4096.0, min_loss_scale=1.0, loss_scale_window=1000, hysteresis=2, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, sgd_momentum=0.9, use_distributed_optimizer=True, overlap_grad_reduce=True, overlap_param_gather=False, clip_grad=1.0, log_num_zeros_in_grad=False, barrier_with_L1_time=True, timers=)
module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, 
False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) 
INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 
1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, 
False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, 
False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name 
INFO:lcvlm_modellink.core.optimizer:_get_param_groups parameter keys ({0..44} denotes one entry per decoder layer):
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, 
False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) 
INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight 
key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name 
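Each _get_param_groups line reports the bucketing key (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) for one parameter: biases and 1-D layernorm weights land in the no-weight-decay bucket (wd_mult 0.0), while the 2-D attention and MLP weights keep wd_mult 1.0. A minimal sketch of that bucketing, assuming Megatron-LM's usual conditions (the helper below is illustrative, not the exact lcvlm_modellink.core.optimizer code):

```python
# Sketch: bucket parameters by (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr),
# mirroring the _get_param_groups lines in the log above. Assumed for this run:
# no scale_lr_cond and no decoupled learning rate are configured.
from collections import defaultdict

def get_param_groups(model):
    buckets = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Biases and 1-D tensors (layernorm weights) are excluded from weight decay.
        no_wd = name.endswith(".bias") or param.ndim == 1
        wd_mult = 0.0 if no_wd else 1.0
        lr_mult = 1.0                                    # no scale_lr_cond in this run
        is_expert_parallel = not getattr(param, "allreduce", True)
        is_decoupled_lr = False                          # decoupled lr not enabled
        key = (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr)
        print("_get_param_groups name", name, "key", key)
        buckets[key].append(param)
    return [
        {"params": params, "wd_mult": wd, "lr_mult": lr}
        for (wd, lr, _, _), params in buckets.items()
    ]
```

Grouping by this key is what lets a single optimizer instance apply a different weight-decay and learning-rate multiplier to each bucket.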
> learning rate decay style: cosine
_load_base_checkpoint iteration 5000
_load_base_checkpoint release False
[... the iteration/release lines repeat once per rank ...]
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt
loading checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ at iteration 5000
load_checkpoint iteration 0
load_checkpoint release False strict True
[... the load_checkpoint lines repeat on each of the 16 ranks ...]
could not find arguments in the checkpoint ...
checkpoint version 3.0
successfully loaded checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ [ t 0, p 0 ] at iteration 0
[after model, optimizer, and learning rate scheduler are built] datetime: 2024-11-27 12:45:29
> building train, validation, and test datasets ...
> datasets target sizes (minimum size):
    train:      64000
    validation: 0
    test:       0
INFO:megatron.core.datasets.blended_megatron_dataset_config:mock = False
INFO:megatron.core.datasets.blended_megatron_dataset_config:Let split_matrix = [(0, 1.0), None, None]
> building train, validation, and test datasets for GPT ...
> rank 2 does not create GPT datasets ...
> rank 3 does not create GPT datasets ...
> rank 4 does not create GPT datasets ...
> rank 5 does not create GPT datasets ...
> rank 6 does not create GPT datasets ...
> rank 7 does not create GPT datasets ...
> rank 8 is creating GPT datasets ...
> rank 9 does not create GPT datasets ...
> rank 10 does not create GPT datasets ...
> rank 11 does not create GPT datasets ...
> rank 12 does not create GPT datasets ...
> rank 13 does not create GPT datasets ...
> rank 14 does not create GPT datasets ...
> rank 15 does not create GPT datasets ...
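Each of the eight tensor-parallel ranks resolves its own shard under iter_0005000/mp_rank_0X/, and the subsequent "load_checkpoint iteration 0" lines are consistent with a finetune-style load that resets the step counter rather than resuming at 5000. A sketch of the path layout, assuming the Megatron naming convention visible in the log (checkpoint_path is a hypothetical helper, not the actual ModelLink API):

```python
# Sketch: shard path resolution behind the _load_base_checkpoint lines above.
# The iter_<NNNNNNN>/mp_rank_<NN>/model_optim_rng.pt layout is taken from the log.
import os

def checkpoint_path(load_dir, iteration, tp_rank, release=False):
    directory = "release" if release else f"iter_{iteration:07d}"
    return os.path.join(load_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt")

load_dir = "/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/"
for tp_rank in range(8):  # TP=8 in this run, hence shards mp_rank_00 .. mp_rank_07
    print("_load_base_checkpoint", checkpoint_path(load_dir, 5000, tp_rank))
```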
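The "Let split_matrix = [(0, 1.0), None, None]" line says the entire blend is assigned to the train split, matching the target sizes (train 64000, validation 0, test 0). A sketch of how a Megatron-style split string could produce that matrix, assuming a "100,0,0" split for this run (an illustrative re-implementation, not the exact megatron.core helper):

```python
# Sketch: convert a "train,valid,test" weight string into interval form.
# Zero-weight splits map to None, as in the split_matrix logged above.
def split_to_matrix(split):
    weights = [float(w) for w in split.split(",")]
    total = sum(weights)
    intervals, acc = [], 0.0
    for w in weights:
        lo = acc / total
        acc += w
        hi = acc / total
        intervals.append((lo, hi) if w > 0 else None)
    return intervals

print(split_to_matrix("100,0,0"))  # [(0.0, 1.0), None, None]
```

Only one rank per data-parallel group materializes the dataset index (rank 8 above); the others reuse the cached result.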
tokenizer Qwen2TokenizerFast(name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={
    151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151657: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151658: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151665: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151666: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151667: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151668: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151669: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151670: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151671: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151672: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151673: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151674: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151675: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151676: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151677: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151678: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151679: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151680: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151681: AddedToken("